Create sample WARC files
Result:
ITS-WARC-EXAMPLE-WGET-20240304204943.warc
Steps:
- Create a web server with sample content
- Create a `.warc` via GNU Wget
Response bodies
| Context | Response body |
|---|---|
| /foo/* | "Foo\n" (4 bytes) |
| /ten-thousand-lines/* | "<br />\n" * 10e3 (70,000 bytes) |
| 404 page | "Not found\n" (10 bytes) |
| 500 page | "Something went wrong\n" (21 bytes) |
Routes with various encoding combinations
Content-Encoding (CE): none, gzip, or br
Transfer-Encoding (TE): none, or chunked
| Path | CE | TE |
|---|---|---|
| /foo/none-none | | |
| /foo/none-chunked | | chunked |
| /foo/gzip-none | gzip | |
| /foo/gzip-chunked | gzip | chunked |
| /foo/br-none | br | |
| /foo/br-chunked | br | chunked |
| /ten-thousand-lines/none-none | | |
| /ten-thousand-lines/none-chunked | | chunked |
| /ten-thousand-lines/gzip-none | gzip | |
| /ten-thousand-lines/gzip-chunked | gzip | chunked |
| /ten-thousand-lines/br-none | br | |
| /ten-thousand-lines/br-chunked | br | chunked |
| /404 | | |
The first time each path is requested, serve the 500 page
For each path, let's have the sample server respond with the 500 page the first time that path is requested:
...
if (counters[req.url] == 0) {
res.writeHead(500, {
...
This will let us easily have both 500 and non-500 responses for HTTP requests in the .warc file.
Create a WARC file via GNU Wget
Let's get each URL twice (once for the 500, then once again):
cat urls.txt urls.txt | \
wget \
--warc-file ./ITS-WARC-EXAMPLE-WGET-`node -e 'process.stdout.write((new Date()).toISOString().substring(0, 19).replace(/[T:-]/g, ""))'` \
--no-warc-compression \
--wait=1.0 \
-O /dev/null \
-i -
When you invoke cat with multiple paths, it outputs each file in turn. So cat urls.txt urls.txt emits the full list of URLs twice, and every URL is requested two times.
urls.txt
http://localhost:3000/foo/none-none
http://localhost:3000/foo/none-chunked
http://localhost:3000/foo/gzip-none
http://localhost:3000/foo/gzip-chunked
http://localhost:3000/foo/br-none
http://localhost:3000/foo/br-chunked
http://localhost:3000/ten-thousand-lines/none-none
http://localhost:3000/ten-thousand-lines/none-chunked
http://localhost:3000/ten-thousand-lines/gzip-none
http://localhost:3000/ten-thousand-lines/gzip-chunked
http://localhost:3000/ten-thousand-lines/br-none
http://localhost:3000/ten-thousand-lines/br-chunked
http://localhost:3000/404
server.js
Code
const http = require("http");
const zlib = require("zlib");
const assert = require("assert");
/**
 * Process-wide mutable state.
 *
 * num_requests_by_path: request URL -> number of times that URL has been
 * requested so far (read and incremented by respond_ce_te).
 */
const g_state = { num_requests_by_path: {} };
// Small fixed payloads, held as Buffers so their byte lengths are exact.
const FOO = Buffer.from("Foo\n", "utf8");
const MESSAGE_404 = Buffer.from("Not found\n", "utf8");
const MESSAGE_5XX = Buffer.from("Something went wrong\n", "utf8");

// Pre-compressed variants of FOO, computed once at startup with zlib defaults.
const FOO_GZIP = zlib.gzipSync(FOO);
const FOO_BR = zlib.brotliCompressSync(FOO);
// 10,000 repetitions of the 7-byte line "<br />\n" (70,000 bytes in total).
// Note: "<br />\n" is in the Brotli dictionary
const TEN_THOUSAND_LINES = Buffer.from("<br />\n".repeat(10_000));

// Pre-compressed variants, computed once at startup with zlib defaults.
const TEN_THOUSAND_LINES_GZIP = zlib.gzipSync(TEN_THOUSAND_LINES);
const TEN_THOUSAND_LINES_BR = zlib.brotliCompressSync(TEN_THOUSAND_LINES);
// Startup self-check: each pre-compressed payload must match the exact bytes
// recorded below (hex), otherwise the process throws before serving anything.
// NOTE(review): byte 9 of the gzip fixtures (the header's OS field) is 0x13;
// zlib builds on other platforms may emit a different value there, which would
// make these checks fail at startup -- confirm that is the intended behaviour.
for (const [compressed, expected_hex] of [
  [FOO_GZIP, "1f8b080000000000001373cbcfe7020096ca00de04000000"],
  [FOO_BR, "8b0180466f6f0a03"],
  [TEN_THOUSAND_LINES_GZIP, "1f8b0800000000000013edc521110040080030ff2968f00538c21081fe8212c8cd2c7be2d74b499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499274d702cd2bb30a70110100"],
  [TEN_THOUSAND_LINES_BR, "5b6f11815f02a00a39ac1270c675242100"],
]) {
  assert.strictEqual(compressed.toString("hex"), expected_hex);
}
// Route table: exact request URL -> handler(req, res).
// One route exists per (payload, Content-Encoding, Transfer-Encoding)
// combination, named "/<payload>/<ce>-<te>" where "none" means the header is
// omitted. Every handler delegates to respond_ce_te with status 200.
const ROUTES = (() => {
  // Payload lookups are wrapped in functions so the body constant is resolved
  // when a request is handled rather than when this table is built.
  const payloads = {
    "foo": {
      none: () => FOO,
      gzip: () => FOO_GZIP,
      br: () => FOO_BR,
    },
    "ten-thousand-lines": {
      none: () => TEN_THOUSAND_LINES,
      gzip: () => TEN_THOUSAND_LINES_GZIP,
      br: () => TEN_THOUSAND_LINES_BR,
    },
  };

  const table = {};
  for (const [prefix, by_ce] of Object.entries(payloads)) {
    for (const [ce_label, get_body] of Object.entries(by_ce)) {
      for (const te_label of ["none", "chunked"]) {
        const ce = ce_label === "none" ? null : ce_label;
        const te = te_label === "none" ? null : te_label;
        table[`/${prefix}/${ce_label}-${te_label}`] = (req, res) => {
          respond_ce_te(req, res, 200, get_body(), ce, te);
        };
      }
    }
  }
  return table;
})();
/**
 * Write a complete HTTP response to `res`, optionally declaring a
 * Content-Encoding and/or Transfer-Encoding.
 *
 * The first time a given `req.url` is seen, the requested response is NOT
 * sent; a 500 response carrying MESSAGE_5XX is sent instead. Every later
 * request for the same URL gets the response described by the arguments.
 * The per-URL request count lives in g_state.num_requests_by_path.
 *
 * @param {object} req - Incoming request; only `req.url` is read.
 * @param {object} res - Response object; `writeHead`, `write` and `end` are called on it.
 * @param {number} status - Status code for the non-500 response.
 * @param {Buffer} body - Bytes to send verbatim; this function does not
 *   compress, so the caller must pass bytes that already match `ce`.
 * @param {?string} ce - Value for the Content-Encoding header, or a falsy value to omit it.
 * @param {?string} te - Value for the Transfer-Encoding header, or a falsy value to
 *   omit it and send Content-Length instead.
 * @throws {AssertionError} If `body` is not a Buffer.
 */
function respond_ce_te(req, res, status, body, ce, te) {
  assert.ok(Buffer.isBuffer(body));

  // 500 for the first time each path is requested
  const counters = g_state.num_requests_by_path;
  const previous_requests = counters[req.url] ?? 0;
  counters[req.url] = previous_requests + 1;
  if (previous_requests === 0) {
    res.writeHead(500, {
      "Content-Type": "text/plain",
      "Content-Length": String(MESSAGE_5XX.length),
    });
    res.end(MESSAGE_5XX);
    return;
  }

  // Response head
  const headers = { "Content-Type": "text/plain" };
  if (ce) {
    headers["Content-Encoding"] = ce;
  }
  if (te) {
    headers["Transfer-Encoding"] = te;
  } else {
    // Without a transfer coding the body length is declared up front.
    headers["Content-Length"] = String(body.length);
  }
  res.writeHead(status, headers);

  // Response body
  if (te === "chunked") {
    // Hand the body over in two separate writes, with the last 3 bytes held
    // back for the second one, instead of a single write.
    const split_at = body.length - 3;
    res.write(body.subarray(0, split_at));
    res.end(body.subarray(split_at, body.length));
  } else {
    res.end(body);
  }
}
// HTTP server: dispatch on the exact value of req.url. URLs with no entry in
// ROUTES are answered through respond_ce_te with status 404 and MESSAGE_404.
const server = http.createServer((req, res) => {
  const handler = ROUTES[req.url];
  if (handler) {
    handler(req, res);
    return;
  }
  respond_ce_te(req, res, 404, MESSAGE_404, null, null);
});
// Bind address for the sample server.
const host = "127.0.0.1";
const port = 3000;

// Start accepting connections and print the base URL once the socket is bound.
server.listen({ port, host }, () => {
  console.log(`Listening on http://${host}:${port}/`);
});