attardi/wikiextractor

How to store each document in its own txt file instead of a single txt file containing multiple documents

Opened this issue · 1 comment

By default, each output txt file contains multiple documents. I want each txt file to contain only one document instead. How can I do that?

I wrote a simple script to do this:

import os
import shutil

# Only files whose names carry this prefix are picked up for processing.
FILE_NAME_PREFIX = "wiki_"

# Root directory holding the extracted results (scanned one sub-directory deep).
START_DIR = "/home/ubuntu/datasets/text"

# Destination directory that receives one .txt file per document.
DOCS_SAVE_PATH = "/home/ubuntu/datasets/utf8_wikipedia_data"

def get_all_files(start_dir=None, file_name_prefix=None):
    """Collect the paths of the extracted files to process.

    The start directory is scanned exactly one level deep: every
    sub-directory of it is listed, and each regular, non-hidden file whose
    name starts with the prefix is returned.

    Args:
        start_dir: Directory containing the extraction sub-directories.
            Defaults to the module-level ``START_DIR``.
        file_name_prefix: Prefix a file name must start with to be included.
            Defaults to the module-level ``FILE_NAME_PREFIX``.

    Returns:
        A list of file paths, sorted by sub-directory and then by file name.

    Raises:
        OSError: If ``start_dir`` (or one of its sub-directories) cannot be
            listed.
    """
    if start_dir is None:
        start_dir = START_DIR
    if file_name_prefix is None:
        file_name_prefix = FILE_NAME_PREFIX

    all_files = []
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    for dir_name in sorted(os.listdir(start_dir)):
        dir_path = os.path.join(start_dir, dir_name)
        # Stray regular files next to the sub-directories must not abort the scan.
        if not os.path.isdir(dir_path):
            continue

        for file_name in sorted(os.listdir(dir_path)):
            # Hidden files (e.g. editor swap files) are never data files.
            if file_name.startswith("."):
                continue
            # A prefix has to match at the start of the name, not anywhere in it.
            if not file_name.startswith(file_name_prefix):
                continue

            file_path = os.path.join(dir_path, file_name)
            if os.path.isfile(file_path):
                all_files.append(file_path)

    return all_files

RECORD_END_MARKER = "</doc>"


def _parse_doc_id(header_line):
    """Return the ``id`` attribute of a ``<doc ...>`` header line, or None."""
    if not header_line.startswith("<doc"):
        return None
    _, found, rest = header_line.partition(' id="')
    if not found:
        return None
    doc_id, closing_quote, _ = rest.partition('"')
    if not closing_quote or not doc_id:
        return None
    return doc_id


def _write_record(doc_id, doc_lines, save_dir):
    """Write one record to ``<save_dir>/<doc_id>.txt``; return the text length.

    The first collected line is dropped (presumably the article title that the
    extractor repeats at the top of the body). Records with nothing left after
    that are not written, and 0 is returned for them.
    """
    if len(doc_lines) <= 1:
        return 0
    txt_to_write = "\n".join(doc_lines[1:])
    save_path = os.path.join(save_dir, doc_id + ".txt")
    with open(save_path, "w", encoding="utf-8") as writer:
        writer.write(txt_to_write)
    return len(txt_to_write)


def extract_records_from_docs(doc_path, save_dir=None):
    """Split one multi-document file into one text file per document.

    The input is expected to consist of records of the form::

        <doc id="..." ...>
        first line
        more lines
        </doc>

    For each record the non-blank, stripped lines are collected, the first one
    is discarded, and the rest are joined with newlines and written to
    ``<save_dir>/<id>.txt``. Records with at most one non-blank line are
    skipped.

    The file is streamed line by line. Lines outside a record that are not a
    valid ``<doc ...>`` header (for example blank lines) are ignored, and a
    final record that is missing its end marker is still written, so neither
    case raises ``IndexError``.

    Args:
        doc_path: Path of the multi-document file to read (decoded as UTF-8).
        save_dir: Directory that receives the per-document files. Defaults to
            the module-level ``DOCS_SAVE_PATH``. It must already exist.

    Returns:
        The length, in characters, of the longest text written from this file,
        or 0 if nothing was written.
    """
    if save_dir is None:
        save_dir = DOCS_SAVE_PATH

    max_txt_len = 0
    doc_id = None  # None means "currently between records"
    doc_lines = []

    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(doc_path, "r", encoding="utf-8") as reader:
        for raw_line in reader:
            if doc_id is None:
                # Looking for the next header; anything else is skipped.
                doc_id = _parse_doc_id(raw_line.strip())
                doc_lines = []
                continue

            if RECORD_END_MARKER in raw_line:
                written = _write_record(doc_id, doc_lines, save_dir)
                max_txt_len = max(max_txt_len, written)
                doc_id = None
                continue

            curr_line = raw_line.strip()
            if curr_line:
                doc_lines.append(curr_line)

    # The file ended inside a record: keep what was read instead of crashing.
    if doc_id is not None:
        written = _write_record(doc_id, doc_lines, save_dir)
        max_txt_len = max(max_txt_len, written)

    return max_txt_len

def main():
    """Rebuild the output directory and split every extracted file into it.

    Any existing DOCS_SAVE_PATH directory is deleted and recreated empty.
    Each file returned by get_all_files() is then split into per-document
    files, and the longest text length is printed per file and overall.
    """
    # Start from an empty output directory
    if os.path.exists(DOCS_SAVE_PATH):
        shutil.rmtree(DOCS_SAVE_PATH)
    os.makedirs(DOCS_SAVE_PATH, exist_ok=True)

    # Split each input file, tracking the longest document seen so far
    longest_overall = 0
    for path in get_all_files():
        longest_in_file = extract_records_from_docs(path)
        print("Got max len of ", longest_in_file, "for file", path)
        if longest_in_file > longest_overall:
            longest_overall = longest_in_file

    print("Got maximum txt length of", longest_overall)


if __name__ == "__main__":
    main()