From 78f57e3b8a058776cbcf43239ba2fe4b5d223db5 Mon Sep 17 00:00:00 2001
From: aschroed <andrew_schroeder@hms.harvard.edu>
Date: Tue, 8 Oct 2024 15:30:26 -0400
Subject: [PATCH 1/3] updated opf status mismatch check to filter on tag;
 modified bed file search for bed2beddb to respect skip_processing tag if
 present

---
 CHANGELOG.rst                               |  9 +++++++
 chalicelib_fourfront/checks/audit_checks.py | 29 ++++++++++++++++-----
 chalicelib_fourfront/checks/wfr_checks.py   |  3 ++-
 pyproject.toml                              |  2 +-
 4 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d515778c..c5cd5840 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,6 +8,15 @@ Change Log
 ----------
 
 
+4.9.5
+=====
+
+`PR 583: Update opf status mismatch check <https://github.com/4dn-dcic/foursight/pull/583>`_
+
+* add a filter on the 'ignore_status_mismatch' tag so that tagged items (opfs, quality metrics or higlass_viewconfs) are ignored by the opf status mismatch check
+* small update to the bed2beddb file searches so that they respect the 'skip_processing' tag if present
+
+
 4.9.4
 =====
 
diff --git a/chalicelib_fourfront/checks/audit_checks.py b/chalicelib_fourfront/checks/audit_checks.py
index b0dd1916..e56a78aa 100644
--- a/chalicelib_fourfront/checks/audit_checks.py
+++ b/chalicelib_fourfront/checks/audit_checks.py
@@ -734,6 +734,9 @@ def check_opf_status_mismatch(connection, **kwargs):
     '''
     check = CheckResult(connection, 'check_opf_status_mismatch')
 
+    # list of uuids to filter out as they have a tag to ignore them
+    tagged2ignore = get_items_with_ignore_tags(connection.ff_keys)
+
     opf_set = ('search/?type=ExperimentSet&other_processed_files.title%21=No+value&field=status'
                '&field=other_processed_files&field=experiments_in_set.other_processed_files')
     opf_exp = ('search/?type=ExperimentSet&other_processed_files.title=No+value'
@@ -741,34 +744,46 @@ def check_opf_status_mismatch(connection, **kwargs):
                '&field=experiments_in_set.other_processed_files&field=status')
     opf_set_results = ff_utils.search_metadata(opf_set, key=connection.ff_keys)
     opf_exp_results = ff_utils.search_metadata(opf_exp, key=connection.ff_keys)
-    results = opf_set_results + opf_exp_results
-    # extract file uuids
+    results = opf_set_results + opf_exp_results  # these are expset and expt items w/opfs
+    # extract all opf file and higlass viewconf uuids
     files = []
     for result in results:
         if result.get('other_processed_files'):
             for case in result['other_processed_files']:
-                files.extend([i['uuid'] for i in case['files']])
+                files.extend([i['uuid'] for i in case['files']]) # if i.get('uuid') not in tagged2ignore])
                 if case.get('higlass_view_config'):
+                    # if case['higlass_view_config'].get('uuid') not in tagged2ignore:
                     files.append(case['higlass_view_config'].get('uuid'))
         if result.get('experiments_in_set'):
             for exp in result['experiments_in_set']:
                 for case in exp['other_processed_files']:
-                    files.extend([i['uuid'] for i in case['files']])
-    # get metadata for files, to collect status
+                    files.extend([i['uuid'] for i in case['files']]) # if i.get('uuid') not in tagged2ignore])
+    
+    # get metadata for files, to collect status 
     resp = get_es_metadata(list(set(files)),
                            sources=['links.quality_metric', 'object.status', 'uuid'],
                            key=connection.ff_keys)
+    # key = opf uuid; value = status
     opf_status_dict = {item['uuid']: item['object']['status'] for item in resp if item['uuid'] in files}
+    
+    # key = opf uuid; value = linked quality metric items
     opf_linked_dict = {
         item['uuid']: item.get('links', {}).get('quality_metric', []) for item in resp if item['uuid'] in files
     }
+
+    # quality metric uuids
     quality_metrics = [uuid for item in resp for uuid in item.get('links', {}).get('quality_metric', [])]
+
+    # get metadata for quality metrics (status)
     qm_resp = get_es_metadata(list(set(quality_metrics)),
                               sources=['uuid', 'object.status'],
                               key=connection.ff_keys)
+
+    # key = qual met uuid; value = status
     opf_other_dict = {item['uuid']: item['object']['status'] for item in qm_resp if item not in files}
+
     check.full_output = {}
-    for result in results:
+    for result in results:  # now go through each expset or experiment again and make sure all the statuses agree
         hg_dict = {item['title']: item.get('higlass_view_config', {}).get('uuid')
                    for item in result.get('other_processed_files', [])}
         titles = [item['title'] for item in result.get('other_processed_files', [])]
@@ -782,7 +797,7 @@ def check_opf_status_mismatch(connection, **kwargs):
             file_list.extend([item for exp in result.get('experiments_in_set', [])
                               for fileset in exp['other_processed_files']
                               for item in fileset['files'] if fileset['title'] == title])
-            statuses = set([opf_status_dict[f['uuid']] for f in file_list])
+            statuses = set([opf_status_dict[f['uuid']] for f in file_list if f.get('uuid') not in tagged2ignore])
             # import pdb; pdb.set_trace()
             if not statuses:
                 # to account for empty sections that may not yet contain files
diff --git a/chalicelib_fourfront/checks/wfr_checks.py b/chalicelib_fourfront/checks/wfr_checks.py
index d87d586a..bf1309db 100644
--- a/chalicelib_fourfront/checks/wfr_checks.py
+++ b/chalicelib_fourfront/checks/wfr_checks.py
@@ -674,7 +674,8 @@ def bed2beddb_status(connection, **kwargs):
                "&extra_files.file_format.display_title=beddb"
                "&extra_files.status=uploading"
                "&extra_files.status=to be uploaded by workflow"
-               "&status!=uploading&status!=to be uploaded by workflow")
+               "&status!=uploading&status!=to be uploaded by workflow"
+               "&tags!=skip_processing")
     # add date
     s_date = kwargs.get('start_date')
     if s_date:
diff --git a/pyproject.toml b/pyproject.toml
index a5ada40c..34fbe314 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "foursight"
-version = "4.9.4"
+version = "4.9.5"
 description = "Serverless Chalice Application for Monitoring"
 authors = ["4DN-DCIC Team <support@4dnucleome.org>"]
 license = "MIT"

From 17de0d92a64810a01722f04917bde8cb2abec9d3 Mon Sep 17 00:00:00 2001
From: aschroed <andrew_schroeder@hms.harvard.edu>
Date: Tue, 8 Oct 2024 15:44:21 -0400
Subject: [PATCH 2/3] update first query on bed2beddb check

---
 chalicelib_fourfront/checks/wfr_checks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/chalicelib_fourfront/checks/wfr_checks.py b/chalicelib_fourfront/checks/wfr_checks.py
index bf1309db..331e3a6a 100644
--- a/chalicelib_fourfront/checks/wfr_checks.py
+++ b/chalicelib_fourfront/checks/wfr_checks.py
@@ -654,11 +654,11 @@ def bed2beddb_status(connection, **kwargs):
     check, skip = wfr_utils.check_indexing(check, connection)
     if skip:
         return check
-    # Build the query (find bg files without bw files)
+    # Build the query (find bed files without beddb files)
     query = ("/search/?type=File&file_format.file_format=bed"
              "&extra_files.file_format.display_title!=beddb"
              "&status!=uploading&status!=to be uploaded by workflow"
-             "&status!=archived&status!=archived to project")
+             "&status!=archived&status!=archived to project&tags!=skip_processing")
     query += "".join(["&file_type=" + i for i in accepted_types])
     # add date
     s_date = kwargs.get('start_date')

From 56d310aa1dd02e7415a9c4160dfd6b043599d843 Mon Sep 17 00:00:00 2001
From: aschroed <andrew_schroeder@hms.harvard.edu>
Date: Tue, 8 Oct 2024 16:26:25 -0400
Subject: [PATCH 3/3] removed commented out pdb

---
 chalicelib_fourfront/checks/audit_checks.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/chalicelib_fourfront/checks/audit_checks.py b/chalicelib_fourfront/checks/audit_checks.py
index e56a78aa..0b1f096b 100644
--- a/chalicelib_fourfront/checks/audit_checks.py
+++ b/chalicelib_fourfront/checks/audit_checks.py
@@ -798,7 +798,6 @@ def check_opf_status_mismatch(connection, **kwargs):
                               for fileset in exp['other_processed_files']
                               for item in fileset['files'] if fileset['title'] == title])
             statuses = set([opf_status_dict[f['uuid']] for f in file_list if f.get('uuid') not in tagged2ignore])
-            # import pdb; pdb.set_trace()
             if not statuses:
                 # to account for empty sections that may not yet contain files
                 pass