# Trailing version marker such as "v2" (optionally followed by whitespace).
# Compiled once at import time instead of on every call.
_DOI_VERSION_SUFFIX = re.compile(r'v\d+\s*$')


def _remove_prefix_and_version_suffix_from_doi(pub_doi_id):
    """Extract a bare DOI from a publication ID of the form ``<prefix>:<doi>``.

    Everything up to and including the first ':' is dropped, surrounding
    whitespace is removed, and a trailing version marker (``v`` followed by
    digits) is cut off.

    Args:
        pub_doi_id: the publication ID, e.g. ``'doi:10.1101/123456v2'``.

    Returns:
        A ``(doi, misformatted)`` tuple.  ``doi`` is the cleaned DOI string,
        or ``None`` when no usable DOI could be extracted.  ``misformatted``
        is ``True`` when the ID was not a string, had no ':' separator,
        carried stray whitespace or a version suffix, or held an empty DOI.
    """
    try:
        raw_doi = pub_doi_id.split(':', 1)[1]
    except (AttributeError, IndexError, TypeError):
        # AttributeError/TypeError: the ID is not a str (e.g. None, bytes);
        # IndexError: there is no ':' separating the prefix from the DOI.
        return None, True

    doi = raw_doi.strip()
    # Stray whitespace would otherwise be sent along in the lookup URL.
    misformatted = doi != raw_doi

    match = _DOI_VERSION_SUFFIX.search(doi)
    if match:
        misformatted = True
        doi = doi[:match.start()].rstrip()

    if not doi:
        # Nothing usable is left (e.g. 'doi:' or 'doi:v2'); report it rather
        # than letting the caller skip it silently.
        return None, True
    return doi, misformatted
== 200: break @@ -540,6 +561,15 @@ def biorxiv_version_update(connection, **kwargs): else: check.status = 'PASS' check.summary = check.description = 'All current bioRxiv Publications are up to date' + + if problem_ids: + check.status = 'WARN' + check.summary = check.summary + f"There are {len(problem_ids)} misformatted or problematic doi pub IDs" + prob_out = {'problem_ids': problem_ids} + if check.brief_output: + check.brief_output.append(prob_out) + else: + check.brief_output = [prob_out] return check diff --git a/pyproject.toml b/pyproject.toml index 945091e0..2f2698e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "foursight" -version = "4.2.1" +version = "4.2.2" description = "Serverless Chalice Application for Monitoring" authors = ["4DN-DCIC Team "] license = "MIT"