Define an index format

At least initially, let's use a simple JSON Lines file with one line per archive item.

Scenarios to handle

Files in a .zip or .7z archive file
HTTP responses in a .warc archive file

Schema

type IndexRow = {
    type: "file" | "response",

    // If it's a file in a .zip / etc:
    path?: str,

    // If it's a response in a .warc:
    url?: str,
    response_at?: str, // e.g. "2024-01-01T00:00:00Z"

    // The bytes of the item
    content: {
        len: int,    // number of bytes
        sha256: str, // lower-case hex
        crc32: str,  // lower-case hex of the same bytes that appear in a .zip file
    },

    // The byte range in the archive file that contains te(ce(bytes of the item))
    // ...where ce is the Content-Encoding function, if any, e.g. `gzip`/`br`/`deflate`
    // ...where te is the Transfer-Encoding, if any, e.g. `chunked`
    content_encoded: {
        offset: int,  // number of bytes that are in the archive file before this range
        len: int,     // number of bytes in this range
        content_encoding: None | "gzip" | "br",
        transfer_encoding: None | "chunked",
    },

    total_header_len: int, // number of bytes from the start of the enclosing record (the ZIP local file header "PK\x03\x04", the WARC record, etc.) to content_encoded.offset
    archive_filename?: str, // e.g. "foo-20240101235959.warc"

    // More information about the HTTP response (if relevant)
    res?: {
        status?: int,
        location?: str | None,
        content_type?: str | None,
        content_disposition?: str | None,
    },
}