From 714fc136f4c822c407fdc5473daa32ecffc725bb Mon Sep 17 00:00:00 2001
From: aschroed
Date: Wed, 31 Jan 2024 15:25:29 -0500
Subject: [PATCH 1/3] fixed a bug due to an unexpected v# extension on the doi used as pub ID causing an exception

---
 CHANGELOG.rst                                 |  7 ++++
 .../checks/wrangler_checks.py                 | 36 +++++++++++++++++--
 pyproject.toml                                |  2 +-
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 8e9b1b92..1a7e52c9 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,13 @@ foursight
 Change Log
 ----------
 
+4.2.2
+=====
+
+* modification of the biorxiv update check to squash a bug
+* if a doi is misformatted or contains an unwanted v# in it they are reported
+
+
 4.2.1
 =====
 
diff --git a/chalicelib_fourfront/checks/wrangler_checks.py b/chalicelib_fourfront/checks/wrangler_checks.py
index e15aef8d..3abbc7b9 100644
--- a/chalicelib_fourfront/checks/wrangler_checks.py
+++ b/chalicelib_fourfront/checks/wrangler_checks.py
@@ -501,6 +501,21 @@ def add_pub_and_replace_biorxiv(connection, **kwargs):
     return action
 
 
+def _remove_prefix_and_version_suffix_from_doi(pub_doi_id):
+    misformatted = False
+    try:
+        doi = pub_doi_id.split(':', 1)[1]
+    except Exception:
+        misformatted = True
+        return None, misformatted
+    pattern = re.compile(r'v\d+\s*$')
+    match = pattern.search(doi)
+    if match:
+        misformatted = True
+        doi = doi[:match.start()]
+    return doi, misformatted
+
+
 @check_function(action="reindex_biorxiv")
 def biorxiv_version_update(connection, **kwargs):
     '''Check if current bioRxiv Publications (not yet replaced with PubmedID)
@@ -512,12 +527,18 @@ def biorxiv_version_update(connection, **kwargs):
     current_biorxivs = ff_utils.search_metadata(query, key=connection.ff_keys)
 
     items_to_update = []
+    problem_ids = []
     biorxiv_api = 'https://api.biorxiv.org/details/biorxiv/'
     for publication in current_biorxivs:
-        if not publication['ID'].startswith('doi:'):
+        pubid = publication.get('ID')
+        if not pubid.startswith('doi:'):
+            continue
+        doi, misformatted = _remove_prefix_and_version_suffix_from_doi(publication['ID'])
+        if misformatted:
+            problem_ids.append(pubid)
+        if not doi:
             continue
-        doi = publication['ID'].split(':')[1]
-        for count in range(5):  # try fetching data a few times
+        for _ in range(5):  # try fetching data a few times
             r = requests.get(biorxiv_api + doi)
             if r.status_code == 200:
                 break
@@ -540,6 +561,15 @@ def biorxiv_version_update(connection, **kwargs):
     else:
         check.status = 'PASS'
         check.summary = check.description = 'All current bioRxiv Publications are up to date'
+
+    if problem_ids:
+        check.status = 'WARN'
+        check.summary = check.summary + f"\nThere are {len(problem_ids)} misformatted or problematic doi pub IDs"
+        prob_out = {'problem_ids': problem_ids}
+        if check.brief_output:
+            check.brief_output.append(prob_out)
+        else:
+            check.brief_output = [prob_out]
     return check
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 945091e0..2f2698e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "foursight"
-version = "4.2.1"
+version = "4.2.2"
 description = "Serverless Chalice Application for Monitoring"
 authors = ["4DN-DCIC Team "]
 license = "MIT"

From d0b4b08e7fff250c8dd78ccce848095a3f8f806c Mon Sep 17 00:00:00 2001
From: aschroed
Date: Wed, 31 Jan 2024 15:32:27 -0500
Subject: [PATCH 2/3] added PR to changelog

---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 1a7e52c9..8301dead 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -13,6 +13,7 @@ Change Log
 * modification of the biorxiv update check to squash a bug
 * if a doi is misformatted or contains an unwanted v# in it they are reported
 
+`PR 560: Fix for biorxiv version update check bug `_
 
 4.2.1
 =====

From ef62c6cc9f9cbc75ffed88fdbc8ce26271e0618e Mon Sep 17 00:00:00 2001
From: aschroed
Date: Fri, 2 Feb 2024 14:29:22 -0500
Subject: [PATCH 3/3] couple small fixes

---
 chalicelib_fourfront/checks/wrangler_checks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chalicelib_fourfront/checks/wrangler_checks.py b/chalicelib_fourfront/checks/wrangler_checks.py
index 3abbc7b9..a9370776 100644
--- a/chalicelib_fourfront/checks/wrangler_checks.py
+++ b/chalicelib_fourfront/checks/wrangler_checks.py
@@ -533,7 +533,7 @@ def biorxiv_version_update(connection, **kwargs):
         pubid = publication.get('ID')
         if not pubid.startswith('doi:'):
             continue
-        doi, misformatted = _remove_prefix_and_version_suffix_from_doi(publication['ID'])
+        doi, misformatted = _remove_prefix_and_version_suffix_from_doi(pubid)
         if misformatted:
             problem_ids.append(pubid)
         if not doi:
@@ -564,7 +564,7 @@ def biorxiv_version_update(connection, **kwargs):
 
     if problem_ids:
         check.status = 'WARN'
-        check.summary = check.summary + f"\nThere are {len(problem_ids)} misformatted or problematic doi pub IDs"
+        check.summary = check.summary + f"There are {len(problem_ids)} misformatted or problematic doi pub IDs"
         prob_out = {'problem_ids': problem_ids}
         if check.brief_output:
             check.brief_output.append(prob_out)