Update code for Python 3 (Python 2.7 no longer maintained after January 1, 2020) #20

Open
bdklahn opened this issue Aug 6, 2019 · 2 comments


bdklahn commented Aug 6, 2019

https://pythonclock.org/

print is now a function (the print statement is gone)
xrange has been replaced by range
many built-ins (e.g. map, zip, dict.keys/values/items) now return iterators/views instead of list objects
etc., etc.

The 2to3 script can help.
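
For anyone trying this, a minimal sketch of running the converter (2to3 ships with CPython, so no extra install should be needed):

    2to3 blast.py        # preview the rewrites for one file
    2to3 -w *.py         # rewrite all .py files in place (backups are kept as .py.bak)

The iterator change in particular can bite silently. In Python 3, for example:

    d = {'a': 1, 'b': 2}
    keys = d.keys()                            # a dict_keys view, not a list
    squares = map(lambda x: x * x, range(3))   # a lazy iterator, consumed once
    first = list(squares)                      # materialize only where a list is needed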


bdklahn commented Aug 6, 2019

output of 2to3-3.5:

--- blast.py    (original)
+++ blast.py    (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import click as ck
 import pandas as pd
 import numpy as np
@@ -104,7 +104,7 @@
     length = 60
     n = len(sequence)
     res = ''
-    for i in xrange(0, n, length):
+    for i in range(0, n, length):
         res += sequence[i: i + length] + '\n'
     return res

--- cafa.py     (original)
+++ cafa.py     (refactored)
@@ -170,8 +170,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))

     with open('data/eshark/targets.txt') as f:
         for line in f:
@@ -185,7 +185,7 @@
                 else:
                     proteins.append('')
                 grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-                for i in xrange(len(seq) - gram_len + 1):
+                for i in range(len(seq) - gram_len + 1):
                     grams[i] = vocab[seq[i: (i + gram_len)]]
                 ngrams.append(grams)

@@ -194,7 +194,7 @@
         'accessions': proteins,
         'ngrams': ngrams})

-    print(len(df))
+    print((len(df)))
     embed_df = pd.read_pickle('data/graph_new_embeddings.pkl')

     df = pd.merge(df, embed_df, on='accessions', how='left')
@@ -253,9 +253,9 @@
     df = pd.merge(targets, mf_preds, on='targets')
     df = pd.merge(df, cc_preds, on='targets')
     df = pd.merge(df, bp_preds, on='targets')
-    mf = map(str, mf_df['functions'].values)
-    cc = map(str, cc_df['functions'].values)
-    bp = map(str, bp_df['functions'].values)
+    mf = list(map(str, mf_df['functions'].values))
+    cc = list(map(str, cc_df['functions'].values))
+    bp = list(map(str, bp_df['functions'].values))
     taxons = set(df['orgs'].values)
     annots = get_real_annotations()
     for tax_id in taxons:
@@ -289,8 +289,8 @@
             f.write('AUTHOR CBRC_BORG\n')
             f.write('MODEL 3\n')
             f.write('KEYWORDS sequence properties, machine learning.\n')
-            for target_id, annots in results.iteritems():
-                for go_id, score in annots.iteritems():
+            for target_id, annots in results.items():
+                for go_id, score in annots.items():
                     sc = '%.2f' % score
                     f.write(target_id + '\t' + go_id + '\t' + sc + '\n')
             f.write('END\n')
@@ -308,7 +308,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -321,7 +321,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -334,7 +334,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -349,7 +349,7 @@
             anchestors.remove(go_id)
             go_set -= anchestors

-    proteins = sorted(annots.keys(), key=lambda x: (
+    proteins = sorted(list(annots.keys()), key=lambda x: (
         x.split('_')[1], x.split('_')[0]))
     with open(root + 'test_predictions.tab', 'w') as f:
         for prot_id in proteins:
@@ -427,7 +427,7 @@
     p = 0.0
     r = 0.0
     f = 0.0
-    for prot, pred_annots in preds.iteritems():
+    for prot, pred_annots in preds.items():
         real_annots = annots[prot]
         if len(real_annots) == 0:
             continue
@@ -443,7 +443,7 @@
             p += precision
             r += recall
             f += 2 * precision * recall / (precision + recall)
-    print(f / total, p / total, r / total)
+    print((f / total, p / total, r / total))


 def main(*args, **kwargs):
--- clustering.py       (original)
+++ clustering.py       (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+

 import click as ck
 import numpy as np
@@ -45,11 +45,11 @@
             if prot2 not in sim:
                 sim[prot2] = {}
             sim[prot2][prot1] = score
-    prots = sim.keys()
+    prots = list(sim.keys())
     n = len(prots)
     X = np.zeros((n, n), dtype=np.float32)
-    for i in xrange(n):
-        for j in xrange(i + 1, n):
+    for i in range(n):
+        for j in range(i + 1, n):
             if prots[j] in sim[prots[i]]:
                 score = sim[prots[i]][prots[j]]
                 X[i, j] = score
--- deeponto.py (original)
+++ deeponto.py (refactored)
@@ -121,7 +121,7 @@
         return values

     def get_values(data_frame):
-        print(data_frame['labels'].values.shape)
+        print((data_frame['labels'].values.shape))
         labels = reshape(data_frame['labels'].values)
         ngrams = sequence.pad_sequences(
             data_frame['ngrams'].values, maxlen=MAXLEN)
@@ -425,7 +425,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -456,13 +456,13 @@

 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -483,8 +483,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -500,7 +500,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -544,7 +544,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -559,7 +559,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- evaluation.py       (original)
+++ evaluation.py       (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+

 import os
 import numpy as np
@@ -45,7 +45,7 @@
     # print(len(preds_dict))
     target_ids = list()
     predictions = list()
-    for key, val in preds_dict.iteritems():
+    for key, val in preds_dict.items():
         target_ids.append(key)
         predictions.append(val)

     # pred_df = pd.DataFrame({'targets': target_ids, 'predictions': predictions})
@@ -62,7 +62,7 @@
     target_ids = list()
     labels = list()
     go_ids = list()
-    for target, gos in targets.iteritems():
+    for target, gos in targets.items():
         go_set = set()
         for go_id in gos:
             if go_id in all_functions:
@@ -145,7 +145,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         # predictions = list()
--- get_data.py (original)
+++ get_data.py (refactored)
@@ -37,7 +37,7 @@
     functions = func_df['functions'].values
     global func_set
     func_set = get_go_set(go, GO_ID)
-    print len(functions)
+    print(len(functions))
     global go_indexes
     go_indexes = dict()
     for ind, go_id in enumerate(functions):
@@ -51,8 +51,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     proteins = list()
     gos = list()
     labels = list()
@@ -87,7 +87,7 @@
         seq = row['sequences']
         sequences.append(seq)
         grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-        for i in xrange(len(seq) - gram_len + 1):
+        for i in range(len(seq) - gram_len + 1):
             grams[i] = vocab[seq[i: (i + gram_len)]]
         ngrams.append(grams)
         label = np.zeros((len(functions),), dtype='int32')
@@ -102,7 +102,7 @@
         'labels': labels,
         'gos': gos,
         'sequences': sequences})
-    print(len(res_df))
+    print((len(res_df)))
     return res_df


@@ -127,7 +127,7 @@
         if not isinstance(row['embeddings'], np.ndarray):
             row['embeddings'] = np.zeros((256,), dtype='float32')
             missing_rep += 1
-    print('Missing network reps:', missing_rep)
+    print(('Missing network reps:', missing_rep))
     df = df[df['orgs'] == '9606']
     # index = df.index.values
     # np.random.seed(seed=0)
--- get_data_all.py     (original)
+++ get_data_all.py     (refactored)
@@ -23,7 +23,7 @@
     global SPLIT
     SPLIT = split
     global GO_IDS
-    GO_IDS = FUNC_DICT.values()
+    GO_IDS = list(FUNC_DICT.values())
     global go
     go = get_gene_ontology('go.obo')
     func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
@@ -38,7 +38,7 @@
         get_go_set(go, GO_IDS[0])
         | get_go_set(go, GO_IDS[1])
         | get_go_set(go, GO_IDS[2]))
-    print len(functions)
+    print(len(functions))
     global go_indexes
     go_indexes = dict()
     for ind, go_id in enumerate(functions):
@@ -52,8 +52,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     proteins = list()
     gos = list()
     labels = list()
@@ -89,7 +89,7 @@
         seq = row['sequences']
         sequences.append(seq)
         grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-        for i in xrange(len(seq) - gram_len + 1):
+        for i in range(len(seq) - gram_len + 1):
             grams[i] = vocab[seq[i: (i + gram_len)]]
         ngrams.append(grams)
         label = np.zeros((len(functions),), dtype='int32')
@@ -104,7 +104,7 @@
         'labels': labels,
         'gos': gos,
         'sequences': sequences})
-    print(len(res_df))
+    print((len(res_df)))
     return res_df


@@ -129,7 +129,7 @@
         if not isinstance(row['embeddings'], np.ndarray):
             row['embeddings'] = np.zeros((256,), dtype='float32')
             missing_rep += 1
-    print('Missing network reps:', missing_rep)
+    print(('Missing network reps:', missing_rep))
     index = df.index.values
     np.random.seed(seed=0)
     np.random.shuffle(index)
@@ -139,7 +139,7 @@
     # prots_df = pd.read_pickle('data/swiss/clusters.pkl')
     # train_df = df[df['proteins'].isin(prots_df['proteins'])]
     # test_df = df[~df['proteins'].isin(prots_df['proteins'])]
-    print(len(train_df), len(test_df))
+    print((len(train_df), len(test_df)))
     train_df.to_pickle(DATA_ROOT + 'train.pkl')
     test_df.to_pickle(DATA_ROOT + 'test.pkl')

--- get_functions.py    (original)
+++ get_functions.py    (refactored)
@@ -34,7 +34,7 @@
     dfs(GO_ID)
     functions.remove(GO_ID)
     functions = list(functions)
-    print(len(functions))
+    print((len(functions)))
     global func_set
     func_set = set(functions)
     global go_indexes
@@ -75,10 +75,10 @@
     for go_id in functions:
         if go_id in annots and annots[go_id] >= annot_num:
             filtered.append(go_id)
-    print len(filtered)
+    print(len(filtered))
     df = pd.DataFrame({'functions': filtered})
     df.to_pickle(DATA_ROOT + FUNCTION + '.pkl')
-    print 'Saved ' + DATA_ROOT + FUNCTION + '.pkl'
+    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')


 if __name__ == '__main__':
--- hierarchical.py     (original)
+++ hierarchical.py     (refactored)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python

-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
+
+
+

 import os
 import sys
@@ -75,7 +75,7 @@
         return values

     def pad_sequences(values, max_len=1000):
-        for i in xrange(len(values)):
+        for i in range(len(values)):
             padded = np.zeros((max_len,), dtype='int32')
             padded[:len(values[i])] = values[i][:]
             values[i] = padded
@@ -137,16 +137,16 @@
         update = trainer.minimize(loss)

         outputs = [loss]
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             go_id = functions[i]
             outputs.append(layers[go_id]['output'])

     with tf.Session() as sess:
         tf.global_variables_initializer().run()
-        for epoch in xrange(epochs):
+        for epoch in range(epochs):
             print('Epoch %d/%d' % (epoch, epochs))
             sum_loss = 0.0
-            with ck.progressbar(xrange(train_steps)) as bar:
+            with ck.progressbar(range(train_steps)) as bar:
                 for step in bar:
                     offset = step * batch_size
                     batch_input1 = train_input1[offset:(offset + batch_size)]
@@ -168,7 +168,7 @@
             predictions = np.empty(
                 (valid_n, len(functions)), dtype='float32')

-            for step in xrange(valid_steps):
+            for step in range(valid_steps):
                 offset = step * batch_size
                 feed_dict = {
                     placeholders['input1']: valid_input1[offset:(offset + batch_size)],
@@ -190,7 +190,7 @@
         sum_loss = 0.0
         predictions = np.empty(
             (test_n, len(functions)), dtype='float32')
-        for step in xrange(test_steps):
+        for step in range(test_steps):
             offset = step * batch_size
             feed_dict = {
                 placeholders['input1']: test_input1[offset:(offset + batch_size)],
--- interactions.py     (original)
+++ interactions.py     (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import os
 import sys
 import numpy as np
@@ -29,8 +29,8 @@
     proteins = list(index.keys())
     new_scores = list()
     interactions = list()
-    for i in xrange(n):
-        for j in xrange(n):
+    for i in range(n):
+        for j in range(n):
             x = index[proteins[i]]
             y = index[proteins[j]]
             new_scores.append(scores[m * x + y])
--- mapping.py  (original)
+++ mapping.py  (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import pandas as pd
 import numpy as np
 from utils import EXP_CODES, get_gene_ontology
@@ -70,7 +70,7 @@
     #         annots_dict[prot_id] = set(row['annots'])
     proteins = list()
     annots = list()
-    for prot, gos in annots_dict.iteritems():
+    for prot, gos in annots_dict.items():
         annots.append(list(gos))
         proteins.append(prot)
     annots_df = pd.DataFrame({
@@ -105,7 +105,7 @@

     proteins = list()

-    for access, gos in goa.iteritems():
+    for access, gos in goa.items():
         if access in prots:
             accessions.append(access)
             proteins.append(prots[access])
@@ -177,11 +177,11 @@
             if st_id in mapping:
                 ac_id = mapping[st_id]
                 embeds[ac_id] = np.array(
-                    map(float, it[1:]), dtype='float32')
+                    list(map(float, it[1:])), dtype='float32')

     df = pd.DataFrame({
-        'accessions': embeds.keys(),
-        'embeddings': embeds.values()})
+        'accessions': list(embeds.keys()),
+        'embeddings': list(embeds.values())})
     print(len(df))
     df.to_pickle('data/graph_new_embeddings.pkl')

@@ -257,7 +257,7 @@
             preds[target_id].append(go_id)
     targets = list()
     predicts = list()
-    for t, p in preds.iteritems():
+    for t, p in preds.items():
         targets.append(t)
         predicts.append(p)
     df = pd.DataFrame({'targets': targets, 'predictions': predicts})
@@ -301,7 +301,7 @@
         if isinstance(row['string'], str):
             st_ids[row['accessions']] = row['string']
     with open('data/human_annotations.tab', 'w') as f:
-        for acc, gos in annots.iteritems():
+        for acc, gos in annots.items():
             if acc in st_ids:
                 f.write(st_ids[acc])
                 for go_id in gos:
@@ -362,7 +362,7 @@
             if it[1] in mapping:
                 for uni_id in mapping[it[1]]:
                     fw.write('UniProtKB\t' + uni_id)
-                    for i in xrange(2, len(it)):
+                    for i in range(2, len(it)):
                         fw.write('\t' + it[i])
                     fw.write('\n')

@@ -389,7 +389,7 @@
         for line in f:
             it = line.strip().split('\t')
             w.write(it[0] + '\t' + it[1] + '\t' + seqs[it[0]] + '\t' + it[2])
-            for i in xrange(3, len(it)):
+            for i in range(3, len(it)):
                 w.write('; ' + it[i])
             w.write('\n')
     w.close()
--- ngrams.py   (original)
+++ ngrams.py   (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import click as ck
 import pandas as pd
 from aaindex import is_ok
@@ -15,7 +15,7 @@
     seqs = get_sequences()
     ngrams = set()
     for seq in seqs:
-        for i in xrange(len(seq) - length + 1):
+        for i in range(len(seq) - length + 1):
             ngrams.add(seq[i: (i + length)])
     ngrams = list(sorted(ngrams))
     print(ngrams[:100])
--- nn_hierarchical_all.py      (original)
+++ nn_hierarchical_all.py      (refactored)
@@ -65,7 +65,7 @@
 @ck.option('--train', is_flag=True)
 def main(device, org, train):
     global GO_IDS
-    GO_IDS = FUNC_DICT.values()
+    GO_IDS = list(FUNC_DICT.values())
     global go
     go = get_gene_ontology('go.obo')
     global ORG
@@ -366,14 +366,14 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # return f
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     proteins = test_df['proteins']
     predictions = list()
-    for i in xrange(preds_max.shape[0]):
+    for i in range(preds_max.shape[0]):
         predictions.append(preds_max[i])
     df = pd.DataFrame(
         {
@@ -428,7 +428,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -453,19 +453,19 @@
             pr /= p_total
             if pr + rc > 0:
                 f = 2 * pr * rc / (pr + rc)
-                print('%s\t%d\t%f\t%f\t%f' % (
-                    ipro_id, len(labels), f, pr, rc))
+                print(('%s\t%d\t%f\t%f\t%f' % (
+                    ipro_id, len(labels), f, pr, rc)))


 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -486,8 +486,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -508,7 +508,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -555,7 +555,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -570,7 +570,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- nn_hierarchical_network.py  (original)
+++ nn_hierarchical_network.py  (refactored)
@@ -111,7 +111,7 @@
         nb_filters = [16, 32, 64, 128]
         nb_convs = [1, 2, 3, 4]
         nb_dense = [1, 2, 3, 4]
-        for i in xrange(param * 32, param * 32 + 32):
+        for i in range(param * 32, param * 32 + 32):
             dim = i % 4
             i = i / 4
             nb_fil = i % 4
@@ -189,7 +189,7 @@
         embedding_dims,
         input_length=MAXLEN,
         dropout=params['embedding_dropout']))
-    for i in xrange(params['nb_conv']):
+    for i in range(params['nb_conv']):
         model.add(Convolution1D(
             nb_filter=params['nb_filter'],
             filter_length=params['filter_length'],
@@ -289,7 +289,7 @@
     net = merge(
         [feature_model, inputs2], mode='concat',
         concat_axis=1, name='merged')
-    for i in xrange(params['nb_dense']):
+    for i in range(params['nb_dense']):
         net = Dense(params['fc_output'], activation='relu')(net)
     layers = get_layers(net)
     output_models = []
@@ -382,14 +382,14 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # return f
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     proteins = test_df['proteins']
     predictions = list()
-    for i in xrange(preds_max.shape[0]):
+    for i in range(preds_max.shape[0]):
         predictions.append(preds_max[i])
     df = pd.DataFrame(
         {
@@ -444,7 +444,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -469,19 +469,19 @@
             pr /= p_total
             if pr + rc > 0:
                 f = 2 * pr * rc / (pr + rc)
-                print('%s\t%d\t%f\t%f\t%f' % (
-                    ipro_id, len(labels), f, pr, rc))
+                print(('%s\t%d\t%f\t%f\t%f' % (
+                    ipro_id, len(labels), f, pr, rc)))


 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -502,8 +502,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -524,7 +524,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -570,7 +570,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -585,7 +585,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- nn_hierarchical_seq.py      (original)
+++ nn_hierarchical_seq.py      (refactored)
@@ -129,7 +129,7 @@
         return values - mn

     def get_values(data_frame):
-        print(data_frame['labels'].values.shape)
+        print((data_frame['labels'].values.shape))
         labels = reshape(data_frame['labels'].values)
         ngrams = sequence.pad_sequences(
             data_frame['ngrams'].values, maxlen=MAXLEN)
@@ -356,7 +356,7 @@

     model = model.layers[1]
     output = model.predict_generator(test_generator, val_samples=len(test_data))
-    print(output.shape)
+    print((output.shape))
     return
     logging.info('Predicting')
     preds = model.predict_generator(
@@ -379,8 +379,8 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     # proteins = test_df['proteins']
@@ -440,7 +440,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -471,13 +471,13 @@

 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -498,8 +498,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -520,7 +520,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -566,7 +566,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -581,7 +581,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- plots.py    (original)
+++ plots.py    (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import os
 import sys
 import numpy as np
@@ -64,7 +64,7 @@
             index.append(i)
     df = df.iloc[index]
     print(len(df))
-    lens = map(len, df['sequences'])
+    lens = list(map(len, df['sequences']))
     c = 0
     for i in lens:
         if i <= 1002:
@@ -72,7 +72,7 @@
     print(c)
     h = np.histogram(lens, bins=(
         0, 500, 1000, 1500, 2000, 40000))
-    plt.bar(range(5),
+    plt.bar(list(range(5)),
         h[0], width=1, facecolor='green')
     titles = ['<=500', '<=1000', '<=1500', '<=2000', '>2000']
     plt.xticks(np.arange(0.5, 5.5, 1), titles)
--- predict.py  (original)
+++ predict.py  (refactored)
@@ -132,12 +132,12 @@
     # logging.info('Inconsistent predictions: %d' % incon)

     predictions = list()
-    for i in xrange(len(targets)):
+    for i in range(len(targets)):
         predictions.append(preds[i])
     df = pd.DataFrame({
         'targets': targets,
         'predictions': predictions})
-    print(len(df))
+    print((len(df)))
     df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
     logging.info('Done in %d sec' % (time.time() - start_time))

@@ -149,7 +149,7 @@
     for i, row in df.iterrows():
         preds = row['predictions']
         go_ids = list()
-        for i in xrange(len(preds)):
+        for i in range(len(preds)):
             if preds[i] >= threshold:
                 go_ids.append(functions[i])
         gos.append(filter_specific(go, go_ids))
--- predict_all.py      (original)
+++ predict_all.py      (refactored)
@@ -85,11 +85,11 @@
                                 seqs = list()
                                 info = list()
                         else:
-                            print('Ignoring sequence {} because its length > 1002'
-                              .format(inf))
+                            print(('Ignoring sequence {} because its length > 1002'
+                              .format(inf)))
                     else:
-                        print('Ignoring sequence {} because of ambigious AA'
-                              .format(inf))
+                        print(('Ignoring sequence {} because of ambigious AA'
+                              .format(inf)))

                     seq = ''
                 inf = line[1:].split()[0]
@@ -108,7 +108,7 @@
         p = Popen(['diamond', 'blastp', '-d', 'data/embeddings',
                    '--max-target-seqs', '1', '--min-score', '60',
                    '--outfmt', '6', 'qseqid', 'sseqid'], stdin=PIPE, stdout=PIPE)
-        for i in xrange(n):
+        for i in range(n):
             p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n')
         p.stdin.close()

@@ -119,13 +119,13 @@
                 if len(it) == 2:
                     prot_ids[it[1]] = int(it[0])

-    prots = embed_df[embed_df['accessions'].isin(prot_ids.keys())]
+    prots = embed_df[embed_df['accessions'].isin(list(prot_ids.keys()))]
     for i, row in prots.iterrows():
         embeds[prot_ids[row['accessions']], :] = row['embeddings']

-    for i in xrange(len(sequences)):
+    for i in range(len(sequences)):
         seq = sequences[i]
-        for j in xrange(min(MAXLEN, len(seq)) - gram_len + 1):
+        for j in range(min(MAXLEN, len(seq)) - gram_len + 1):
             data[i, j] = vocab[seq[j: (j + gram_len)]]
     return [data, embeds]

@@ -133,13 +133,13 @@
 def predict(data, model, model_name, functions, threshold, batch_size):
     n = data[0].shape[0]
     result = list()
-    for i in xrange(n):
+    for i in range(n):
         result.append(list())
     predictions = model.predict(
         data, batch_size=batch_size, verbose=1)
-    for i in xrange(n):
+    for i in range(n):
         pred = (predictions[i] >= threshold).astype('int32')
-        for j in xrange(len(functions)):
+        for j in range(len(functions)):
             if pred[j] == 1:
                 result[i].append(model_name + '_' + functions[j] + '|' + '%.2f' % predictions[i][j])
     return result
@@ -157,8 +157,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
         gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     threshold = 0.3
     # sequences = ['MKKVLVINGPNLNLLGIREKNIYGSVSYEDVLKSISRKAQELGFEVEFFQSNHEGEIIDKIHRAYFEKVDAIIINPGAYTHYSYAIHDAIKAVNIPTIEVHISNIHAREEFRHKSVIAPACTGQISGFGIKSYIIALYALKEILD']
     # data = get_data(sequences)
@@ -167,7 +167,7 @@
         df = pd.read_pickle('data/models/%s.pkl' % onto)
         functions = df['functions']
         models.append((model, functions))
-        print 'Model %s initialized.' % onto
+        print('Model %s initialized.' % onto)
         # result = predict(data, model, functions, threshold)
         # print result

@@ -180,15 +180,15 @@
     data = get_data(sequences, prot_ids)
     result = list()
     n = len(sequences)
-    for i in xrange(n):
+    for i in range(n):
         result.append([])
     for i in range(len(models)):
         model, functions = models[i]
-        print 'Running predictions for model %s' % funcs[i]
+        print('Running predictions for model %s' % funcs[i])
         res = predict(data, model, funcs[i], functions, threshold, batch_size)
-        for j in xrange(n):
+        for j in range(n):
             result[j] += res[j]
-    print('Predictions time: {}'.format(time.time() - start_time))
+    print(('Predictions time: {}'.format(time.time() - start_time)))
     return result


--- stats.py    (original)
+++ stats.py    (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import numpy as np
 import pandas as pd
 import click as ck
--- text.py     (original)
+++ text.py     (refactored)
@@ -1,7 +1,7 @@
 #!/usr/bin/env python

-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import sys
 import os
 import pandas as pd
@@ -51,7 +51,7 @@
             items = line.strip().split('\t')
             if items[0] in uni_ids:
                 text_reps[uni_ids[items[0]]] = np.array(
-                    map(float, items[1:]), dtype='float32')
+                    list(map(float, items[1:])), dtype='float32')
     return text_reps


--- tf_utils.py (original)
+++ tf_utils.py (refactored)
@@ -1,6 +1,6 @@
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
+
+
+
 import tensorflow as tf
 import numpy as np

--- utils.py    (original)
+++ utils.py    (refactored)
@@ -87,10 +87,10 @@
                     obj['is_obsolete'] = True
     if obj is not None:
         go[obj['id']] = obj
-    for go_id in go.keys():
+    for go_id in list(go.keys()):
         if go[go_id]['is_obsolete']:
             del go[go_id]
-    for go_id, val in go.iteritems():
+    for go_id, val in go.items():
         if 'children' not in val:
             val['children'] = set()
         for p_id in val['is_a']:
@@ -247,16 +247,16 @@
         else:
             if self.monitor_op(current, self.best):
                 if self.verbose > 0:
-                    print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+                    print(('Epoch %05d: %s improved from %0.5f to %0.5f,'
                           ' saving model to %s'
                           % (epoch, self.monitor, self.best,
-                             current, filepath))
+                             current, filepath)))
                 self.best = current
                 save_model_weights(self.model, filepath)
             else:
                 if self.verbose > 0:
-                    print('Epoch %05d: %s did not improve' %
-                          (epoch, self.monitor))
+                    print(('Epoch %05d: %s did not improve' %
+                          (epoch, self.monitor)))


 class DataGenerator(object):
@@ -275,12 +275,12 @@
         self.has_targets = targets is not None

     def __next__(self):
-        return self.next()
+        return next(self)

     def reset(self):
         self.start = 0

-    def next(self):
+    def __next__(self):
         if self.start < self.size:
             # output = []
             # if self.has_targets:
@@ -304,7 +304,7 @@
             return res_inputs
         else:
             self.reset()
-            return self.next()
+            return next(self)


 if __name__ == '__main__':
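
One caveat with the output above: in files that were missing from __future__ import print_function, 2to3 cannot tell whether print('a', b) was meant as a function call or as a statement printing a tuple, so it preserves the tuple semantics by adding an extra pair of parentheses. The converted code runs, but lines like

    print(('Gram length:', gram_len))

print a tuple such as ('Gram length:', 3). Dropping the inner parentheses gives the presumably intended space-separated output:

    print('Gram length:', gram_len)

These cases need to be cleaned up by hand after running the script.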


oeway commented Sep 29, 2019

I found that two additional changes need to be made in predict_all.py:

  1. change line 112, p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n') to

p.stdin.write(bytes('>' + str(i) + '\n' + sequences[i] + '\n', 'utf-8'))

  2. change line 118, it = line.strip().split('\t') to

it = line.decode('utf-8').strip().split('\t')

It would be great if someone could test a Python 3 version.
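
Alternatively, both fixes can be avoided by opening the pipes in text mode. A sketch, untested (universal_newlines=True is the pre-3.7 spelling of text=True; the Popen arguments, n, and sequences are taken from the surrounding predict_all.py code):

    from subprocess import Popen, PIPE

    # Text mode: stdin accepts str and stdout yields str, so no manual
    # bytes encoding/decoding is needed at the call sites.
    p = Popen(['diamond', 'blastp', '-d', 'data/embeddings',
               '--max-target-seqs', '1', '--min-score', '60',
               '--outfmt', '6', 'qseqid', 'sseqid'],
              stdin=PIPE, stdout=PIPE, universal_newlines=True)
    for i in range(n):
        p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n')
    p.stdin.close()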
