Abstract code for subject disambiguation into command line code

Question

Abstract code for subject disambiguation into command line code

kallewesterling opened this issue 2 years ago · 0 comments

Here's a quick sketch:

# Setup:
# - put username and password separated with a space in a file called "auth"
# - make sure panoptes_client, tqdm and requests are installed packages

from pathlib import Path
from panoptes_client import Panoptes, Project, SubjectSet
from tqdm.notebook import tqdm

import requests
import hashlib
import json
import time

def get_md5(path: "str | Path", chunk_size: int = 1 << 20) -> str:
    """
    Computes the MD5 hash of a file.

    The file is read in blocks of ``chunk_size`` bytes, so the whole file is
    never held in memory at once.

    .. versionadded:: 0.1.0

    Parameters
    ----------
    path : str or Path
        The path of the file to compute the MD5 hash for.
    chunk_size : int, optional
        Number of bytes read from the file per iteration. Must be positive.
        Defaults to 1 MiB.

    Returns
    -------
    str
        The computed MD5 hash in hexadecimal format.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.

    Notes
    -----
    The function is borrowed from https://bit.ly/3TvUrd1.
    """
    if chunk_size <= 0:
        # read(0) returns b"" immediately, which would silently produce the
        # hash of an empty file instead of the real content.
        raise ValueError("chunk_size must be a positive integer")

    md5_hash = hashlib.md5()

    with open(path, "rb") as f:
        # iter() with a sentinel keeps calling f.read(chunk_size) until it
        # returns b"", i.e. until end of file.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5_hash.update(chunk)

    return md5_hash.hexdigest()

# Set up username and password.
# The "auth" file holds "<username> <password>" on a single line. Surrounding
# whitespace is stripped so that a trailing newline (added by most editors)
# does not end up inside the password, and only the first run of whitespace is
# used as the separator so the password itself may contain spaces.
username, password = Path("./auth").read_text().strip().split(maxsplit=1)

# Project ID for LwM is 9943
project_id = 9943

# Connect API
Panoptes.connect(username=username, password=password)

# Set up Project
project = Project(project_id)

# Load subject sets + set up names.
# subject_sets.json is a JSON object whose keys are subject set names and
# whose values are the IDs later handed to SubjectSet().
subject_set_ids = json.loads(Path("subject_sets.json").read_text())
subject_set_names = list(subject_set_ids)

# Load in the done subject sets (so we don't double up).
# The file is absent on the very first run, in which case nothing is done yet.
done_subject_sets = (
    json.loads(Path("done_subject_sets.json").read_text())
    if Path("done_subject_sets.json").exists()
    else []
)

# Names of the subject sets that still need processing, in file order.
lst = [x for x in subject_set_names if x not in done_subject_sets]  # and x in PROCESS_KEYS

# Walk every pending subject set and make sure each of its subjects carries the
# MD5 hash of its media file under the "!zooniverse_file_md5" metadata key.
for subject_set_name in lst:  # tqdm(lst, position = 0, desc=subject_set_name):
    subject_set_id = subject_set_ids[subject_set_name]
    subject_set = SubjectSet(subject_set_id)

    # Set to True as soon as one download fails; the set is then NOT recorded
    # as done, so a later run picks it up again.
    errors_occurred = False

    for subject in tqdm(
        subject_set.subjects,
        position=1,
        total=subject_set.set_member_subjects_count,
        desc=subject_set_name,
    ):  # leave=False,
        # Subjects that already have a hash need no further work.
        if "!zooniverse_file_md5" in subject.metadata:
            continue

        urls = [url for x in subject.locations for url in x.values()]

        # Ensure we have only one URL
        if len(urls) > 1:
            raise NotImplementedError("This script has no ability to process multi-URL subjects yet.")

        if not urls:
            print(f"--> Warning: subject {subject.id} had no URL!")
            continue

        # because we will only process subjects with one URL (see above)
        url = urls[0]

        # Downloads are cached on disk under the last path segment of the URL,
        # so a re-run does not fetch the same file twice.
        filename = url.split('/')[-1]
        filepath = Path(f"downloads/{filename}")

        if not filepath.exists():
            filepath.parent.mkdir(parents=True, exist_ok=True)

            try:
                r = requests.get(url, timeout=10)
            except requests.RequestException as exc:
                # Only network-level failures (timeouts, connection errors,
                # ...) are treated as retryable. Anything else, including
                # Ctrl-C, propagates instead of being silently swallowed.
                print(f"--> Warning: download failed for subject {subject.id}: {exc}")
                errors_occurred = True
                # Back off before moving on to the next subject.
                time.sleep(50)
                continue

            if r.status_code != 200:
                raise RuntimeError(f"Failed with status {r.status_code}")

            filepath.write_bytes(r.content)

        md5 = get_md5(filepath)

        subject.metadata["!zooniverse_file_md5"] = md5
        subject.save()

    # Persist progress after every fully successful set so an interrupted run
    # can resume without reprocessing completed sets.
    if not errors_occurred:
        done_subject_sets.append(subject_set_name)
        Path("done_subject_sets.json").write_text(json.dumps(done_subject_sets))