pisa-engine/pisa

Schema files to organize files

elshize opened this issue · 0 comments

Below is a rough draft of a schema/config/meta file (idk what name fits best here) to organize the files together.

The primary goal is to have sane defaults, so that anyone using PISA to parse, invert, compress, query, etc. doesn't need to provide all the information that is currently required on the command line, but can instead just point at the meta file that has all the info.

This is just a draft, and intentionally ignores shards right now.

@JMMackenzie @amallia please let me know what you think.

# Note that most (if not all) settings can be overridden with CLI
# arguments at different stages, but all have defaults for simplicity.

[collection]
# Stemmer is defined at the time of collection parsing.
# Typically, it should not be changed later on.
stemmer = "porter2"
# Path to forward index (binary collection format).
# Key uses snake_case, like every other multi-word key in this file.
forward_index = "cw09b.fwd"
# Path to terms file (newline-delimited text file)
terms = "cw09b.terms"
# Path to term lexicon file
terms_lexicon = "cw09b.termlex"
# Path to documents file (newline-delimited text file)
documents = "cw09b.documents"
# Path to document lexicon file
document_lexicon = "cw09b.doclex"
# Path to URL file (newline-delimited text file)
urls = "cw09b.urls"

[collection.stats]
# Example values only; kept mutually consistent so the draft reads correctly:
# total_document_length == documents * avg_document_length.
terms = 100
documents = 100
total_document_length = 10_000
avg_document_length = 100
# There's probably something I'm forgetting...

[inverted_collection]
# Presumably one file path per component of the inverted collection
# (sizes, frequencies, documents) -- confirm against the consumer.
sizes = "cw09b.sizes"
frequencies = "cw09b.freqs"
documents = "cw09b.docs"

# Each [[inverted_index]] entry pairs an encoding with the path it is
# stored at. The first entry omits `ordering`; the second sets it.
[[inverted_index]]
path = "cw09b.block_simdbp"
encoding = "block_simdbp"

[[inverted_index]]
path = "cw09b.pef.url"
encoding = "pef"
ordering = "url"

# Numeric parameters below are native TOML numbers rather than quoted
# strings, so consumers get typed values without parsing text.
[[score_metadata]]  # "WAND data"
compressed = false
path = "cw09b.bm25.fixed"
type = "fixed"
# Parameter given for the "fixed" entry.
block_size = 128

[[score_metadata]]
compressed = false
path = "cw09b.bm25.var"
type = "variable"
# Parameter given for the "variable" entry.
# NOTE(review): write 16.0 instead if the consumer expects a float.
lambda = 16