Define an index format
At least initially, let's use a simple JSON Lines file with one line per archive item.
Scenarios to handle
- Files in a `.zip` or `.7z` archive file
- HTTP responses in a `.warc` archive file
Schema
/**
 * One row of the index: describes a single archive item, which is either a
 * file stored in a .zip / etc archive or an HTTP response stored in a .warc.
 *
 * Integer-valued fields are typed `number` (TypeScript has no separate
 * integer type); absent values are `null`.
 */
type IndexRow = {
  type: "file" | "response";

  // If it's a file in a .zip / etc:
  path?: string;

  // If it's a response in a .warc:
  url?: string;
  response_at?: string; // e.g. "2024-01-01T00:00:00Z"

  // The bytes of the item
  content: {
    len: number; // number of bytes (integer)
    sha256: string; // lower-case hex
    crc32: string; // lower-case hex of the same bytes that appear in a .zip file
  };

  // The byte range in the archive file that contains te(ce(bytes of the item))
  // ...where ce is the Content-Encoding function, if any, e.g. `gzip`/`br`/`deflate`
  // ...where te is the Transfer-Encoding, if any, e.g. `chunked`
  content_encoded: {
    offset: number; // number of bytes that are in the archive file before this range (integer)
    len: number; // number of bytes in this range (integer)
    // NOTE(review): `deflate` is given as an example above but is not part of
    // this union — confirm whether it should be representable here.
    content_encoding: null | "gzip" | "br";
    transfer_encoding: null | "chunked";
  };

  // number of bytes between content_encoded.offset and the offset of the
  // start of the PK34/WARCRecord/etc (integer)
  total_header_len: number;

  archive_filename?: string; // e.g. "foo-20240101235959.warc"

  // More information about the HTTP response (if relevant)
  res?: {
    status?: number; // integer
    location?: string | null;
    content_type?: string | null;
    content_disposition?: string | null;
  };
};