Create sample WARC files

Result:

ITS-WARC-EXAMPLE-WGET-20240304204943.warc

Steps:

Create a web server with sample content
Create a .warc via GNU Wget

Response bodies#

Context	Response body
`/foo/*`	`"Foo\n"` (4 bytes)
`/ten-thousand-lines/*`	`"<br />\n" * 10e3` (70,000 bytes)
404 page	`"Not found\n"` (10 bytes)
500 page	`"Something went wrong\n"` (21 bytes)

Routes with various encoding combinations#

Content-Encoding : none, gzip, or br
Transfer-Encoding : none, or chunked

Path	CE	TE
`/foo/none-none`
`/foo/none-chunked`		`chunked`
`/foo/gzip-none`	`gzip`
`/foo/gzip-chunked`	`gzip`	`chunked`
`/foo/br-none`	`br`
`/foo/br-chunked`	`br`	`chunked`
`/ten-thousand-lines/none-none`
`/ten-thousand-lines/none-chunked`		`chunked`
`/ten-thousand-lines/gzip-none`	`gzip`
`/ten-thousand-lines/gzip-chunked`	`gzip`	`chunked`
`/ten-thousand-lines/br-none`	`br`
`/ten-thousand-lines/br-chunked`	`br`	`chunked`
`/404`

The first time each path is requested, serve the 500 page#

For each path, let's have the sample server respond with the 500 page the first time that path is requested:

...
if (counters[req.url] == 0) {
    res.writeHead(500, {
        ...

This will let us easily have both 500 and non-500 responses for HTTP requests in the .warc file.

Create a WARC file via GNU Wget#

Let's get each URL twice (once for the 500, then once again):

cat urls.txt urls.txt | \
    wget \
        --warc-file ./ITS-WARC-EXAMPLE-WGET-`node -e 'process.stdout.write((new Date()).toISOString().substring(0, 19).replace(/[T:-]/g, ""))'` \
        --no-warc-compression \
        --wait=1.0 \
        -O /dev/null \
        -i -

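When you invoke cat with multiple paths, it outputs the contents of each one in order. So cat urls.txt urls.txt emits the whole URL list twice: the first pass collects the 500 responses, and the second pass collects the real ones.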

urls.txt#

http://localhost:3000/foo/none-none
http://localhost:3000/foo/none-chunked
http://localhost:3000/foo/gzip-none
http://localhost:3000/foo/gzip-chunked
http://localhost:3000/foo/br-none
http://localhost:3000/foo/br-chunked
http://localhost:3000/ten-thousand-lines/none-none
http://localhost:3000/ten-thousand-lines/none-chunked
http://localhost:3000/ten-thousand-lines/gzip-none
http://localhost:3000/ten-thousand-lines/gzip-chunked
http://localhost:3000/ten-thousand-lines/br-none
http://localhost:3000/ten-thousand-lines/br-chunked
http://localhost:3000/404

server.js#

Code

const http = require("http");
const zlib = require("zlib");
const assert = require("assert");

// Mutable, process-wide state shared by the request handlers.
const g_state = {
    // Number of requests seen so far, keyed by request URL (see respond_ce_te).
    num_requests_by_path: {},
};

// The short sample payload, plus pre-compressed copies of the same bytes.
const FOO = Buffer.from("Foo\n", "utf8");
const FOO_GZIP = zlib.gzipSync(FOO);
const FOO_BR = zlib.brotliCompressSync(FOO);

// Plain-text bodies used for the error responses.
const MESSAGE_404 = Buffer.from("Not found\n", "utf8");
const MESSAGE_5XX = Buffer.from("Something went wrong\n", "utf8");

// The large sample payload: the 7-byte line "<br />\n" repeated 10,000 times
// (70,000 bytes in total).
// Note: "<br />\n" is in the Brotli dictionary
const TEN_THOUSAND_LINES = Buffer.from("<br />\n".repeat(10_000));
const TEN_THOUSAND_LINES_GZIP = zlib.gzipSync(TEN_THOUSAND_LINES);
const TEN_THOUSAND_LINES_BR = zlib.brotliCompressSync(TEN_THOUSAND_LINES);

/**
 * Asserts that a gzip buffer matches a reference hex dump, ignoring the
 * header's OS field.
 *
 * Byte 9 of a gzip header identifies the platform that produced the file
 * (RFC 1952), and zlib fills it in per build target. The reference dumps
 * below carry 0x13 there; comparing that byte verbatim would make the
 * server refuse to start on any platform whose zlib writes another value,
 * even though the compressed stream itself is identical.
 *
 * @param {Buffer} actual - gzip output to check (not modified).
 * @param {string} expected_hex - reference dump, lowercase hex.
 */
function assert_gzip_hex(actual, expected_hex) {
    const GZIP_OS_OFFSET = 9;
    const expected = Buffer.from(expected_hex, "hex");
    const normalized = Buffer.from(actual); // copy: the served payload stays untouched
    if (normalized.length > GZIP_OS_OFFSET && expected.length > GZIP_OS_OFFSET) {
        normalized[GZIP_OS_OFFSET] = expected[GZIP_OS_OFFSET];
    }
    assert.strictEqual(normalized.toString("hex"), expected_hex);
}

// Startup sanity checks: pin the compressed payloads to known reference bytes.
assert_gzip_hex(FOO_GZIP, "1f8b080000000000001373cbcfe7020096ca00de04000000");
assert.strictEqual(FOO_BR.toString("hex"), "8b0180466f6f0a03");
assert_gzip_hex(TEN_THOUSAND_LINES_GZIP, "1f8b0800000000000013edc521110040080030ff2968f00538c21081fe8212c8cd2c7be2d74b499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499224499274d702cd2bb30a70110100");
assert.strictEqual(TEN_THOUSAND_LINES_BR.toString("hex"), "5b6f11815f02a00a39ac1270c675242100");

// Route table: request URL -> handler(req, res).
//
// Every payload is exposed under "<prefix>/<content-encoding>-<transfer-encoding>",
// where "none" in the path means the corresponding header is not sent at all.
// Each handler delegates to respond_ce_te with status 200.
const ROUTES = (() => {
    // [path prefix, Content-Encoding label, payload getter]. The payload is
    // looked up when a request arrives, not while the table is being built.
    const variants = [
        ["/foo", "none", () => FOO],
        ["/foo", "gzip", () => FOO_GZIP],
        ["/foo", "br", () => FOO_BR],
        ["/ten-thousand-lines", "none", () => TEN_THOUSAND_LINES],
        ["/ten-thousand-lines", "gzip", () => TEN_THOUSAND_LINES_GZIP],
        ["/ten-thousand-lines", "br", () => TEN_THOUSAND_LINES_BR],
    ];
    const transfer_labels = ["none", "chunked"];

    const table = {};
    for (const [prefix, ce_label, get_body] of variants) {
        for (const te_label of transfer_labels) {
            const ce = ce_label === "none" ? null : ce_label;
            const te = te_label === "none" ? null : te_label;
            table[`${prefix}/${ce_label}-${te_label}`] = (req, res) => {
                respond_ce_te(req, res, 200, get_body(), ce, te);
            };
        }
    }
    return table;
})();

/**
 * Sends one response with the requested Content-Encoding / Transfer-Encoding
 * combination, except that the first request for any given URL is answered
 * with the 500 page instead.
 *
 * @param {object} req - incoming request; only `req.url` is read, as the
 *     key of the per-path request counter.
 * @param {object} res - response object; `writeHead`, `write` and `end` are called.
 * @param {number} status - status code for the non-500 response.
 * @param {Buffer} body - bytes to send verbatim (an AssertionError is thrown
 *     for anything that is not a Buffer). No compression happens here, so
 *     the bytes must already match `ce`.
 * @param {?string} ce - Content-Encoding header value; falsy to omit the header.
 * @param {?string} te - Transfer-Encoding header value; falsy to send a
 *     Content-Length header instead.
 */
function respond_ce_te(req, res, status, body, ce, te) {
    assert.ok(Buffer.isBuffer(body));

    // 500 for the first time each path is requested
    const counters = g_state.num_requests_by_path;
    const previous_requests = counters[req.url] ?? 0;
    counters[req.url] = previous_requests + 1;
    if (previous_requests === 0) {
        res.writeHead(500, {
            "Content-Type": "text/plain",
            "Content-Length": String(MESSAGE_5XX.length),
        });
        res.end(MESSAGE_5XX);
        return;
    }

    // Response head
    const headers = { "Content-Type": "text/plain" };
    if (ce) {
        headers["Content-Encoding"] = ce;
    }
    if (te) {
        headers["Transfer-Encoding"] = te;
    } else {
        headers["Content-Length"] = String(body.length);
    }
    res.writeHead(status, headers);

    // Response body
    if (te === "chunked") {
        // Two separate writes (everything but the last 3 bytes, then the
        // rest) so that the body goes out as more than one chunk. The split
        // point is clamped so bodies shorter than 3 bytes are not sliced
        // with a negative index; an empty leading chunk is skipped.
        const split_at = Math.max(0, body.length - 3);
        if (split_at > 0) {
            res.write(body.subarray(0, split_at));
        }
        res.end(body.subarray(split_at));
    } else {
        res.end(body);
    }
}

// Address the sample server binds to.
const port = 3000;
const host = "127.0.0.1";

// Fallback for URLs without a ROUTES entry. It goes through respond_ce_te
// like every other route, so the first request for an unknown URL also
// receives the 500 page before the 404 page is served.
const not_found = (req, res) => {
    respond_ce_te(req, res, 404, MESSAGE_404, null, null);
};

// Looks up the handler by the exact request URL and invokes it.
function dispatch(req, res) {
    const handler = ROUTES[req.url] || not_found;
    handler(req, res);
}

const server = http.createServer(dispatch);
server.listen(port, host, () => {
    console.log(`Listening on http://${host}:${port}/`);
});