Imports from Wayback list the final redirect URL twice
Closed this issue · 0 comments
Mr0grog commented
When we import versions from Wayback to web-monitoring-db using the script in web_monitoring/cli/cli.py
, we are listing the final URL in a redirect chain twice. See this version for an example: https://api-staging.monitoring.envirodatagov.org/api/v0/versions/47951a31-ff31-44b2-b910-3c41278a9399
This is probably a bug in format_memento()
:
web-monitoring-processing/web_monitoring/cli/cli.py
Lines 290 to 339 in 3f731b6
def format_memento(self, memento, cdx_record, maintainers, tags): | |
""" | |
Format a Wayback Memento response as a dict with import-ready info. | |
""" | |
iso_date = cdx_record.timestamp.isoformat() | |
if cdx_record.timestamp.tzinfo is None: | |
iso_date += 'Z' | |
# Get all headers from the original response. | |
prefix = 'X-Archive-Orig-' | |
original_headers = { | |
k[len(prefix):]: v for k, v in memento.headers.items() | |
if k.startswith(prefix) | |
} | |
metadata = { | |
'mime_type': memento.headers.get('content-type', '').split(';', 1)[0], | |
'encoding': memento.encoding, | |
'headers': original_headers, | |
'view_url': cdx_record.view_url | |
} | |
if memento.status_code >= 400: | |
metadata['error_code'] = memento.status_code | |
# If there were redirects, list every URL in the chain of requests. | |
if memento.url != cdx_record.raw_url: | |
redirects = list(map( | |
lambda response: wayback.memento_url_data(response.url)[0], | |
memento.history)) | |
redirected_url = wayback.memento_url_data(memento.url)[0] | |
redirects.append(redirected_url) | |
metadata['redirected_url'] = redirected_url | |
metadata['redirects'] = redirects | |
return dict( | |
# Page-level info | |
page_url=cdx_record.url, | |
page_maintainers=maintainers, | |
page_tags=tags, | |
title=utils.extract_title(memento.content), | |
# Version/memento-level info | |
capture_time=iso_date, | |
uri=cdx_record.raw_url, | |
version_hash=utils.hash_content(memento.content), | |
source_type='internet_archive', | |
source_metadata=metadata, | |
status=memento.status_code | |
) |