Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved handling of user query for higlass items #564

Merged
merged 11 commits into from
Mar 21, 2024
8 changes: 8 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ foursight
Change Log
----------

4.4.3
=====
* Add helper to convert user input str to list for select queries in higlass_checks.py
* Adjust output of check_validation_errors check to list affected items by type in full_output if not too many
* Update lock file to use foursight-core with bug fix for local-check-execution script

`PR 564: Improved handling of user query for higlass items <https://github.com/4dn-dcic/foursight/pull/564>`_

4.4.2
=====
* Added 'input_bed' to attr_keys in wfr_utils.py's start_missing_run for ATAC-seq pipeline
Expand Down
13 changes: 10 additions & 3 deletions chalicelib_fourfront/checks/audit_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,17 +781,24 @@ def check_validation_errors(connection, **kwargs):
returns link to search if found.
'''
check = CheckResult(connection, 'check_validation_errors')

search_url = 'search/?validation_errors.name!=No+value&type=Item'
results = ff_utils.search_metadata(search_url + '&field=@id', key=connection.ff_keys)
if results:
types = {item for result in results for item in result['@type'] if item != 'Item'}
ids_by_type = {}
for result in results:
ids_by_type.setdefault(result.get('@type')[0], []).append(result.get('@id'))
check.status = 'WARN'
check.summary = 'Validation errors found'
check.description = ('{} items found with validation errors, comprising the following '
'item types: {}. \nFor search results see link below.'.format(
len(results), ', '.join(list(types))))
len(results), ', '.join(ids_by_type.keys())))
check.ff_link = connection.ff_server + search_url
# too many items of a type suggests a possibly general issue for that type
for ty, item_ids in ids_by_type.items():
if len(item_ids) > 100:
ids_by_type[ty] = 'Many items of this type have validation errors'
check.full_output = ids_by_type
else:
check.status = 'PASS'
check.summary = 'No validation errors'
Expand Down
93 changes: 70 additions & 23 deletions chalicelib_fourfront/checks/higlass_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,12 +354,7 @@ def find_files_requiring_higlass_items(connection, check_name, action_name, sear
check.queries = []
check.action = action_name

# If no search query was provided, fail
if not search_queries:
check.summary = check.description = "No search query provided, nothing to update."
check.status = 'PASS'
check.allow_action = False
return check
search_queries = verify_queries(check, search_queries)

# Add the fields we want to return.
fields_to_include = '&field=' + '&field='.join((
Expand All @@ -385,7 +380,7 @@ def find_files_requiring_higlass_items(connection, check_name, action_name, sear
file_search_query = "/search/?type=File&higlass_uid!=No+value&genome_assembly!=No+value" + query + fields_to_include

# Query the files
search_res = ff_utils.search_metadata(file_search_query, key=connection.ff_keys)
search_res = get_search_results(connection, check, file_search_query)

# Collate the results into a dict of ExpSets, ordered by accession
for found_file in search_res:
Expand Down Expand Up @@ -430,11 +425,20 @@ def find_files_requiring_higlass_items(connection, check_name, action_name, sear
check.full_output[full_output_key] = {}
check.full_output[full_output_key][ga] = target_files_by_ga[ga]

if check.full_output.get('search_problems'):
check.summary = "Please check input query for typo or formatting error"
check.description = check.summary + ". See full_output for details."
check.allow_action = False
check.status = 'FAIL'
return check

if not target_files_by_ga:
# nothing new to generate
check.summary = check.description = "No new Higlass Items to generate"
check.allow_action = False
check.status = 'PASS'
return check

else:
all_files = sum([len(x) for x in check.full_output["ready"].values()])
check.summary = "Ready to generate %s Higlass Items" % all_files
Expand Down Expand Up @@ -827,12 +831,7 @@ def find_expsets_processedfiles_requiring_higlass_items(connection, check_name,
"static_content",
])

# If no search query was provided, fail
if not search_queries:
check.summary = check.description = "No search query provided, nothing to update."
check.status = 'PASS'
check.allow_action = False
return check
search_queries = verify_queries(check, search_queries)

expsets_by_accession = {}
# Use all of the search queries to make a list of the ExpSets we will work on.
Expand All @@ -844,7 +843,7 @@ def find_expsets_processedfiles_requiring_higlass_items(connection, check_name,
processed_expsets_query = "/search/?type=ExperimentSetReplicate" + query + fields_to_include

# Query the Experiment Sets
search_res = ff_utils.search_metadata(processed_expsets_query, key=connection.ff_keys)
search_res = get_search_results(connection, check, processed_expsets_query)

# Collate the results into a dict of ExpSets, ordered by accession
for expset in search_res:
Expand Down Expand Up @@ -897,6 +896,14 @@ def find_expsets_processedfiles_requiring_higlass_items(connection, check_name,
ready_to_generate_count = sum([len(accessions) for x, accessions in check.full_output["ready_expsets"].items()])

check.summary = ""

if check.full_output.get('search_problems'):
check.summary = "Please check input query for typo or formatting error"
check.description = check.summary + ". See full_output for details."
check.allow_action = False
check.status = 'FAIL'
return check

# If there are no files to act upon, we're done.
if not target_files_by_ga:
check.summary = check.description = "No new view configs to generate"
Expand Down Expand Up @@ -1177,19 +1184,15 @@ def find_expsets_otherprocessedfiles_requiring_higlass_items(connection, check_n
check.queries = []
check.action = action_name

# If no search query was provided and find_opfs_missing_higlass is False, pass with no results
if not (search_queries or find_opfs_missing_higlass):
check.summary = check.description = "No search query provided, nothing to update."
check.status = 'PASS'
check.allow_action = False
return check

if find_opfs_missing_higlass:
search_queries = [
"&experiments_in_set.other_processed_files.files.higlass_uid%21=No+value",
"&other_processed_files.files.higlass_uid%21=No+value"
]

else:
search_queries = verify_queries(check, search_queries)

# get the fields you need to include
fields_to_include = ""
for new_field in (
Expand All @@ -1216,7 +1219,7 @@ def find_expsets_otherprocessedfiles_requiring_higlass_items(connection, check_n
expset_query = "/search/?type=ExperimentSetReplicate" + query + fields_to_include

# Store results by accession
search_res = ff_utils.search_metadata(expset_query, key=connection.ff_keys)
search_res = get_search_results(connection, check, expset_query)
for expset in search_res:
expsets_by_accession[ expset["accession"] ] = expset

Expand Down Expand Up @@ -1368,6 +1371,13 @@ def consider_filegroup(fg):
all_filegroups_to_update[accession] = filegroups_to_update
higlass_view_count += len(filegroups_to_update.keys())

if check.full_output.get('search_problems'):
check.summary = "Please check input query for typo or formatting error"
check.description = check.summary + ". See full_output for details."
check.allow_action = False
check.status = 'FAIL'
return check

# check announces success
check.full_output['filegroups_to_update'] = all_filegroups_to_update
check.full_output['expsets_to_update'] = expsets_to_update
Expand Down Expand Up @@ -1697,7 +1707,7 @@ def does_url_exist(path):
continue

# Query all possible files
possibly_reg = ff_utils.search_metadata(search_query, key=connection.ff_keys)
possibly_reg = get_search_results(connection, check, search_query)

for procfile in possibly_reg:
current_file_cat = file_cat
Expand Down Expand Up @@ -2287,3 +2297,40 @@ def convert_es_timestamp_to_datetime(raw):
"%Y-%m-%dT%H:%M:%S"
)
return converted_date

def verify_queries(check, search_queries):
    """
    Helper to check that a search query is properly formatted and reformat it if necessary.

    Args:
        check(CheckResult): Result of check, to be passed from check.
        search_queries(list or string): A list of search queries. All Files found in at least one of the queries will be modified.

    Returns:
        Formatted search_queries (list)
    """
    # If no search query was provided, pass with no results. Return an empty
    # list so callers iterating over the queries simply do nothing — the
    # original fall-through would crash on None (TypeError in the
    # comprehension below).
    if not search_queries:
        check.summary = check.description = "No search query provided, nothing to update."
        check.status = 'PASS'
        check.allow_action = False
        return []

    if isinstance(search_queries, str):
        # for case where (possibly multiple) query is passed in via kwargs
        queries = search_queries.split(',')
        search_queries = [q.strip() for q in queries]

        check.brief_output = {
            "corrected_query": "The query was not formatted as a list, please double check results"
        }

    # add '&' when missing from str; skip empty entries (e.g. from a trailing
    # comma), which previously raised IndexError on q[0]
    search_queries = [q if q.startswith('&') else '&' + q for q in search_queries if q]

    return search_queries

def get_search_results(connection, check, search_query):
    """
    Run a metadata search, recording any failure on the check instead of raising.

    Args:
        connection: Fourfront connection providing ff_keys for the search.
        check(CheckResult): Check whose full_output collects search problems.
        search_query(string): The search URL/query to execute.

    Returns:
        The search results list, or an empty list if the search raised.
    """
    try:
        results = ff_utils.search_metadata(search_query, key=connection.ff_keys)
    except Exception as problem:
        # stash the error message so the check can report a bad query
        problems = check.full_output.setdefault('search_problems', [])
        problems.append(str(problem))
        results = []
    return results
Loading
Loading