Abstract code for subject disambiguation into command line code
kallewesterling opened this issue · 0 comments
kallewesterling commented
Here's a quick sketch:
# Setup:
# - put username and password separated with a space in a file called "auth"
# - make sure panoptes_client, tqdm and requests are installed packages
from pathlib import Path
from panoptes_client import Panoptes, Project, SubjectSet
from tqdm.notebook import tqdm
import requests
import hashlib
import json
import time
def get_md5(path: "str | Path") -> str:
    """
    Computes the MD5 hash of a file.

    .. versionadded:: 0.1.0

    Parameters
    ----------
    path : str or pathlib.Path
        The path of the file to compute the MD5 hash for.

    Returns
    -------
    str
        The computed MD5 hash in hexadecimal format.

    Notes
    -----
    The function is borrowed from https://bit.ly/3TvUrd1.

    The file is read in fixed-size chunks so that large downloads are never
    held in memory in one piece; the resulting digest is identical to
    hashing the whole content at once.
    """
    chunk_size = 64 * 1024
    md5_hash = hashlib.md5()
    with open(path, "rb") as f:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()
# Set up username and password.
# The auth file holds "<username> <password>". Strip surrounding whitespace
# (editors usually append a newline, which would otherwise end up inside the
# password) and split only once so the rest of the line is kept intact.
username, password = Path("./auth").read_text().strip().split(maxsplit=1)

# Project ID for LwM is 9943
project_id = 9943

# Connect API
Panoptes.connect(username=username, password=password)

# Set up Project
project = Project(project_id)

# Load subject sets (a JSON object mapping subject set name -> id) + set up names
subject_set_ids = json.loads(Path("subject_sets.json").read_text())
subject_set_names = list(subject_set_ids.keys())

# Load in the done subject sets (so we don't double up)
done_path = Path("done_subject_sets.json")
done_subject_sets = json.loads(done_path.read_text()) if done_path.exists() else []

# Only the subject sets that have not been fully processed in an earlier run
lst = [x for x in subject_set_names if x not in done_subject_sets]

for subject_set_name in lst:
    subject_set_id = subject_set_ids[subject_set_name]
    subject_set = SubjectSet(subject_set_id)

    # Stays False only if every download attempt in this set succeeded; a set
    # with failures is not recorded as done, so a later run revisits it.
    errors_occured = False

    for subject in tqdm(
        subject_set.subjects,
        position=1,
        total=subject_set.set_member_subjects_count,
        desc=subject_set_name,
    ):
        # Subjects that already carry the hash need no further work.
        if "!zooniverse_file_md5" in subject.metadata:
            continue

        urls = [url for x in subject.locations for url in x.values()]

        # Ensure we have only one URL
        if len(urls) > 1:
            raise NotImplementedError("This script has no ability to process multi-URL subjects yet.")

        if len(urls) == 0:
            print(f"--> Warning: subject {subject.id} had not URL!")
            continue

        # because we will only process subjects with one URL (see above)
        url = urls[0]
        filename = url.split('/')[-1]
        filepath = Path(f"downloads/{filename}")

        # Download only when the file is not already cached locally.
        if not filepath.exists():
            filepath.parent.mkdir(parents=True, exist_ok=True)

            try:
                r = requests.get(url, timeout=10)
            except requests.RequestException as err:
                # Network-level failure: report it, back off, and move on.
                # Catching only requests' own errors keeps Ctrl-C and real
                # programming errors from being swallowed.
                print(f"--> Warning: could not download {url} ({err})")
                errors_occured = True
                time.sleep(50)
                continue

            if r.status_code != 200:
                raise RuntimeError(f"Failed with status {r.status_code}")

            filepath.write_bytes(r.content)

        # Hash the local copy and store the digest on the subject.
        md5 = get_md5(filepath)
        subject.metadata["!zooniverse_file_md5"] = md5
        subject.save()

    if not errors_occured:
        # Persist progress after every completed set so an interrupted run
        # can resume without redoing finished sets.
        done_subject_sets.append(subject_set_name)
        done_path.write_text(json.dumps(done_subject_sets))