Update code for Python 3 (Python 2.7 no longer maintained after January 1, 2020) #20

Open
bdklahn opened this issue Aug 6, 2019 · 2 comments


bdklahn commented Aug 6, 2019

https://pythonclock.org/

print is now a function (the print statement is gone)
xrange has been replaced by range
many built-ins (e.g. map, zip, dict.keys/values/items) now return iterators/views instead of list objects
etc., etc.

The 2to3 script can help.
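
For anyone trying this, a minimal sketch of running the converter (2to3 ships with CPython, so no extra install should be needed):

    2to3 blast.py        # preview the rewrites for one file
    2to3 -w *.py         # rewrite all .py files in place (backups are kept as .py.bak)

The iterator change in particular can bite silently. In Python 3, for example:

    d = {'a': 1, 'b': 2}
    keys = d.keys()                            # a dict_keys view, not a list
    squares = map(lambda x: x * x, range(3))   # a lazy iterator, consumed once
    first = list(squares)                      # materialize only where a list is needed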


bdklahn commented Aug 6, 2019

output of 2to3-3.5:

--- blast.py    (original)
+++ blast.py    (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import click as ck
 import pandas as pd
 import numpy as np
@@ -104,7 +104,7 @@
     length = 60
     n = len(sequence)
     res = ''
-    for i in xrange(0, n, length):
+    for i in range(0, n, length):
         res += sequence[i: i + length] + '\n'
     return res

--- cafa.py     (original)
+++ cafa.py     (refactored)
@@ -170,8 +170,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))

     with open('data/eshark/targets.txt') as f:
         for line in f:
@@ -185,7 +185,7 @@
                 else:
                     proteins.append('')
                 grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-                for i in xrange(len(seq) - gram_len + 1):
+                for i in range(len(seq) - gram_len + 1):
                     grams[i] = vocab[seq[i: (i + gram_len)]]
                 ngrams.append(grams)

@@ -194,7 +194,7 @@
         'accessions': proteins,
         'ngrams': ngrams})

-    print(len(df))
+    print((len(df)))
     embed_df = pd.read_pickle('data/graph_new_embeddings.pkl')

     df = pd.merge(df, embed_df, on='accessions', how='left')
@@ -253,9 +253,9 @@
     df = pd.merge(targets, mf_preds, on='targets')
     df = pd.merge(df, cc_preds, on='targets')
     df = pd.merge(df, bp_preds, on='targets')
-    mf = map(str, mf_df['functions'].values)
-    cc = map(str, cc_df['functions'].values)
-    bp = map(str, bp_df['functions'].values)
+    mf = list(map(str, mf_df['functions'].values))
+    cc = list(map(str, cc_df['functions'].values))
+    bp = list(map(str, bp_df['functions'].values))
     taxons = set(df['orgs'].values)
     annots = get_real_annotations()
     for tax_id in taxons:
@@ -289,8 +289,8 @@
             f.write('AUTHOR CBRC_BORG\n')
             f.write('MODEL 3\n')
             f.write('KEYWORDS sequence properties, machine learning.\n')
-            for target_id, annots in results.iteritems():
-                for go_id, score in annots.iteritems():
+            for target_id, annots in results.items():
+                for go_id, score in annots.items():
                     sc = '%.2f' % score
                     f.write(target_id + '\t' + go_id + '\t' + sc + '\n')
             f.write('END\n')
@@ -308,7 +308,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -321,7 +321,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -334,7 +334,7 @@
         prot_id = row['proteins']
         if prot_id not in preds:
             preds[prot_id] = set()
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             if row['predictions'][i] == 1:
                 preds[prot_id].add(functions[i])
         if prot_id not in annots:
@@ -349,7 +349,7 @@
             anchestors.remove(go_id)
             go_set -= anchestors

-    proteins = sorted(annots.keys(), key=lambda x: (
+    proteins = sorted(list(annots.keys()), key=lambda x: (
         x.split('_')[1], x.split('_')[0]))
     with open(root + 'test_predictions.tab', 'w') as f:
         for prot_id in proteins:
@@ -427,7 +427,7 @@
     p = 0.0
     r = 0.0
     f = 0.0
-    for prot, pred_annots in preds.iteritems():
+    for prot, pred_annots in preds.items():
         real_annots = annots[prot]
         if len(real_annots) == 0:
             continue
@@ -443,7 +443,7 @@
             p += precision
             r += recall
             f += 2 * precision * recall / (precision + recall)
-    print(f / total, p / total, r / total)
+    print((f / total, p / total, r / total))


 def main(*args, **kwargs):
--- clustering.py       (original)
+++ clustering.py       (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+

 import click as ck
 import numpy as np
@@ -45,11 +45,11 @@
             if prot2 not in sim:
                 sim[prot2] = {}
             sim[prot2][prot1] = score
-    prots = sim.keys()
+    prots = list(sim.keys())
     n = len(prots)
     X = np.zeros((n, n), dtype=np.float32)
-    for i in xrange(n):
-        for j in xrange(i + 1, n):
+    for i in range(n):
+        for j in range(i + 1, n):
             if prots[j] in sim[prots[i]]:
                 score = sim[prots[i]][prots[j]]
                 X[i, j] = score
--- deeponto.py (original)
+++ deeponto.py (refactored)
@@ -121,7 +121,7 @@
         return values

     def get_values(data_frame):
-        print(data_frame['labels'].values.shape)
+        print((data_frame['labels'].values.shape))
         labels = reshape(data_frame['labels'].values)
         ngrams = sequence.pad_sequences(
             data_frame['ngrams'].values, maxlen=MAXLEN)
@@ -425,7 +425,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -456,13 +456,13 @@

 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -483,8 +483,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -500,7 +500,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -544,7 +544,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -559,7 +559,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- evaluation.py       (original)
+++ evaluation.py       (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+

 import os
 import numpy as np
@@ -45,7 +45,7 @@
     # print(len(preds_dict))
     target_ids = list()
     predictions = list()
-    for key, val in preds_dict.iteritems():
+    for key, val in preds_dict.items():
         target_ids.append(key)
         predictions.append(val)

     # pred_df = pd.DataFrame({'targets': target_ids, 'predictions': predictions})
@@ -62,7 +62,7 @@
     target_ids = list()
     labels = list()
     go_ids = list()
-    for target, gos in targets.iteritems():
+    for target, gos in targets.items():
         go_set = set()
         for go_id in gos:
             if go_id in all_functions:
@@ -145,7 +145,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         # predictions = list()
--- get_data.py (original)
+++ get_data.py (refactored)
@@ -37,7 +37,7 @@
     functions = func_df['functions'].values
     global func_set
     func_set = get_go_set(go, GO_ID)
-    print len(functions)
+    print(len(functions))
     global go_indexes
     go_indexes = dict()
     for ind, go_id in enumerate(functions):
@@ -51,8 +51,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     proteins = list()
     gos = list()
     labels = list()
@@ -87,7 +87,7 @@
         seq = row['sequences']
         sequences.append(seq)
         grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-        for i in xrange(len(seq) - gram_len + 1):
+        for i in range(len(seq) - gram_len + 1):
             grams[i] = vocab[seq[i: (i + gram_len)]]
         ngrams.append(grams)
         label = np.zeros((len(functions),), dtype='int32')
@@ -102,7 +102,7 @@
         'labels': labels,
         'gos': gos,
         'sequences': sequences})
-    print(len(res_df))
+    print((len(res_df)))
     return res_df


@@ -127,7 +127,7 @@
         if not isinstance(row['embeddings'], np.ndarray):
             row['embeddings'] = np.zeros((256,), dtype='float32')
             missing_rep += 1
-    print('Missing network reps:', missing_rep)
+    print(('Missing network reps:', missing_rep))
     df = df[df['orgs'] == '9606']
     # index = df.index.values
     # np.random.seed(seed=0)
--- get_data_all.py     (original)
+++ get_data_all.py     (refactored)
@@ -23,7 +23,7 @@
     global SPLIT
     SPLIT = split
     global GO_IDS
-    GO_IDS = FUNC_DICT.values()
+    GO_IDS = list(FUNC_DICT.values())
     global go
     go = get_gene_ontology('go.obo')
     func_df = pd.read_pickle(DATA_ROOT + 'bp.pkl')
@@ -38,7 +38,7 @@
         get_go_set(go, GO_IDS[0])
         | get_go_set(go, GO_IDS[1])
         | get_go_set(go, GO_IDS[2]))
-    print len(functions)
+    print(len(functions))
     global go_indexes
     go_indexes = dict()
     for ind, go_id in enumerate(functions):
@@ -52,8 +52,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
     gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     proteins = list()
     gos = list()
     labels = list()
@@ -89,7 +89,7 @@
         seq = row['sequences']
         sequences.append(seq)
         grams = np.zeros((len(seq) - gram_len + 1, ), dtype='int32')
-        for i in xrange(len(seq) - gram_len + 1):
+        for i in range(len(seq) - gram_len + 1):
             grams[i] = vocab[seq[i: (i + gram_len)]]
         ngrams.append(grams)
         label = np.zeros((len(functions),), dtype='int32')
@@ -104,7 +104,7 @@
         'labels': labels,
         'gos': gos,
         'sequences': sequences})
-    print(len(res_df))
+    print((len(res_df)))
     return res_df


@@ -129,7 +129,7 @@
         if not isinstance(row['embeddings'], np.ndarray):
             row['embeddings'] = np.zeros((256,), dtype='float32')
             missing_rep += 1
-    print('Missing network reps:', missing_rep)
+    print(('Missing network reps:', missing_rep))
     index = df.index.values
     np.random.seed(seed=0)
     np.random.shuffle(index)
@@ -139,7 +139,7 @@
     # prots_df = pd.read_pickle('data/swiss/clusters.pkl')
     # train_df = df[df['proteins'].isin(prots_df['proteins'])]
     # test_df = df[~df['proteins'].isin(prots_df['proteins'])]
-    print(len(train_df), len(test_df))
+    print((len(train_df), len(test_df)))
     train_df.to_pickle(DATA_ROOT + 'train.pkl')
     test_df.to_pickle(DATA_ROOT + 'test.pkl')

--- get_functions.py    (original)
+++ get_functions.py    (refactored)
@@ -34,7 +34,7 @@
     dfs(GO_ID)
     functions.remove(GO_ID)
     functions = list(functions)
-    print(len(functions))
+    print((len(functions)))
     global func_set
     func_set = set(functions)
     global go_indexes
@@ -75,10 +75,10 @@
     for go_id in functions:
         if go_id in annots and annots[go_id] >= annot_num:
             filtered.append(go_id)
-    print len(filtered)
+    print(len(filtered))
     df = pd.DataFrame({'functions': filtered})
     df.to_pickle(DATA_ROOT + FUNCTION + '.pkl')
-    print 'Saved ' + DATA_ROOT + FUNCTION + '.pkl'
+    print('Saved ' + DATA_ROOT + FUNCTION + '.pkl')


 if __name__ == '__main__':
--- hierarchical.py     (original)
+++ hierarchical.py     (refactored)
@@ -1,8 +1,8 @@
 #!/usr/bin/env python

-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
+
+
+

 import os
 import sys
@@ -75,7 +75,7 @@
         return values

     def pad_sequences(values, max_len=1000):
-        for i in xrange(len(values)):
+        for i in range(len(values)):
             padded = np.zeros((max_len,), dtype='int32')
             padded[:len(values[i])] = values[i][:]
             values[i] = padded
@@ -137,16 +137,16 @@
         update = trainer.minimize(loss)

         outputs = [loss]
-        for i in xrange(len(functions)):
+        for i in range(len(functions)):
             go_id = functions[i]
             outputs.append(layers[go_id]['output'])

     with tf.Session() as sess:
         tf.global_variables_initializer().run()
-        for epoch in xrange(epochs):
+        for epoch in range(epochs):
             print('Epoch %d/%d' % (epoch, epochs))
             sum_loss = 0.0
-            with ck.progressbar(xrange(train_steps)) as bar:
+            with ck.progressbar(range(train_steps)) as bar:
                 for step in bar:
                     offset = step * batch_size
                     batch_input1 = train_input1[offset:(offset + batch_size)]
@@ -168,7 +168,7 @@
             predictions = np.empty(
                 (valid_n, len(functions)), dtype='float32')

-            for step in xrange(valid_steps):
+            for step in range(valid_steps):
                 offset = step * batch_size
                 feed_dict = {
                     placeholders['input1']: valid_input1[offset:(offset + batch_size)],
@@ -190,7 +190,7 @@
         sum_loss = 0.0
         predictions = np.empty(
             (test_n, len(functions)), dtype='float32')
-        for step in xrange(test_steps):
+        for step in range(test_steps):
             offset = step * batch_size
             feed_dict = {
                 placeholders['input1']: test_input1[offset:(offset + batch_size)],
--- interactions.py     (original)
+++ interactions.py     (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import os
 import sys
 import numpy as np
@@ -29,8 +29,8 @@
     proteins = list(index.keys())
     new_scores = list()
     interactions = list()
-    for i in xrange(n):
-        for j in xrange(n):
+    for i in range(n):
+        for j in range(n):
             x = index[proteins[i]]
             y = index[proteins[j]]
             new_scores.append(scores[m * x + y])
--- mapping.py  (original)
+++ mapping.py  (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import pandas as pd
 import numpy as np
 from utils import EXP_CODES, get_gene_ontology
@@ -70,7 +70,7 @@
     #         annots_dict[prot_id] = set(row['annots'])
     proteins = list()
     annots = list()
-    for prot, gos in annots_dict.iteritems():
+    for prot, gos in annots_dict.items():
         annots.append(list(gos))
         proteins.append(prot)
     annots_df = pd.DataFrame({
@@ -105,7 +105,7 @@

     proteins = list()

-    for access, gos in goa.iteritems():
+    for access, gos in goa.items():
         if access in prots:
             accessions.append(access)
             proteins.append(prots[access])
@@ -177,11 +177,11 @@
             if st_id in mapping:
                 ac_id = mapping[st_id]
                 embeds[ac_id] = np.array(
-                    map(float, it[1:]), dtype='float32')
+                    list(map(float, it[1:])), dtype='float32')

     df = pd.DataFrame({
-        'accessions': embeds.keys(),
-        'embeddings': embeds.values()})
+        'accessions': list(embeds.keys()),
+        'embeddings': list(embeds.values())})
     print(len(df))
     df.to_pickle('data/graph_new_embeddings.pkl')

@@ -257,7 +257,7 @@
             preds[target_id].append(go_id)
     targets = list()
     predicts = list()
-    for t, p in preds.iteritems():
+    for t, p in preds.items():
         targets.append(t)
         predicts.append(p)
     df = pd.DataFrame({'targets': targets, 'predictions': predicts})
@@ -301,7 +301,7 @@
         if isinstance(row['string'], str):
             st_ids[row['accessions']] = row['string']
     with open('data/human_annotations.tab', 'w') as f:
-        for acc, gos in annots.iteritems():
+        for acc, gos in annots.items():
             if acc in st_ids:
                 f.write(st_ids[acc])
                 for go_id in gos:
@@ -362,7 +362,7 @@
             if it[1] in mapping:
                 for uni_id in mapping[it[1]]:
                     fw.write('UniProtKB\t' + uni_id)
-                    for i in xrange(2, len(it)):
+                    for i in range(2, len(it)):
                         fw.write('\t' + it[i])
                     fw.write('\n')

@@ -389,7 +389,7 @@
         for line in f:
             it = line.strip().split('\t')
             w.write(it[0] + '\t' + it[1] + '\t' + seqs[it[0]] + '\t' + it[2])
-            for i in xrange(3, len(it)):
+            for i in range(3, len(it)):
                 w.write('; ' + it[i])
             w.write('\n')
     w.close()
--- ngrams.py   (original)
+++ ngrams.py   (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import click as ck
 import pandas as pd
 from aaindex import is_ok
@@ -15,7 +15,7 @@
     seqs = get_sequences()
     ngrams = set()
     for seq in seqs:
-        for i in xrange(len(seq) - length + 1):
+        for i in range(len(seq) - length + 1):
             ngrams.add(seq[i: (i + length)])
     ngrams = list(sorted(ngrams))
     print(ngrams[:100])
--- nn_hierarchical_all.py      (original)
+++ nn_hierarchical_all.py      (refactored)
@@ -65,7 +65,7 @@
 @ck.option('--train', is_flag=True)
 def main(device, org, train):
     global GO_IDS
-    GO_IDS = FUNC_DICT.values()
+    GO_IDS = list(FUNC_DICT.values())
     global go
     go = get_gene_ontology('go.obo')
     global ORG
@@ -366,14 +366,14 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # return f
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     proteins = test_df['proteins']
     predictions = list()
-    for i in xrange(preds_max.shape[0]):
+    for i in range(preds_max.shape[0]):
         predictions.append(preds_max[i])
     df = pd.DataFrame(
         {
@@ -428,7 +428,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -453,19 +453,19 @@
             pr /= p_total
             if pr + rc > 0:
                 f = 2 * pr * rc / (pr + rc)
-                print('%s\t%d\t%f\t%f\t%f' % (
-                    ipro_id, len(labels), f, pr, rc))
+                print(('%s\t%d\t%f\t%f\t%f' % (
+                    ipro_id, len(labels), f, pr, rc)))


 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -486,8 +486,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -508,7 +508,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -555,7 +555,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -570,7 +570,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- nn_hierarchical_network.py  (original)
+++ nn_hierarchical_network.py  (refactored)
@@ -111,7 +111,7 @@
         nb_filters = [16, 32, 64, 128]
         nb_convs = [1, 2, 3, 4]
         nb_dense = [1, 2, 3, 4]
-        for i in xrange(param * 32, param * 32 + 32):
+        for i in range(param * 32, param * 32 + 32):
             dim = i % 4
             i = i / 4
             nb_fil = i % 4
@@ -189,7 +189,7 @@
         embedding_dims,
         input_length=MAXLEN,
         dropout=params['embedding_dropout']))
-    for i in xrange(params['nb_conv']):
+    for i in range(params['nb_conv']):
         model.add(Convolution1D(
             nb_filter=params['nb_filter'],
             filter_length=params['filter_length'],
@@ -289,7 +289,7 @@
     net = merge(
         [feature_model, inputs2], mode='concat',
         concat_axis=1, name='merged')
-    for i in xrange(params['nb_dense']):
+    for i in range(params['nb_dense']):
         net = Dense(params['fc_output'], activation='relu')(net)
     layers = get_layers(net)
     output_models = []
@@ -382,14 +382,14 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # return f
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     proteins = test_df['proteins']
     predictions = list()
-    for i in xrange(preds_max.shape[0]):
+    for i in range(preds_max.shape[0]):
         predictions.append(preds_max[i])
     df = pd.DataFrame(
         {
@@ -444,7 +444,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -469,19 +469,19 @@
             pr /= p_total
             if pr + rc > 0:
                 f = 2 * pr * rc / (pr + rc)
-                print('%s\t%d\t%f\t%f\t%f' % (
-                    ipro_id, len(labels), f, pr, rc))
+                print(('%s\t%d\t%f\t%f\t%f' % (
+                    ipro_id, len(labels), f, pr, rc)))


 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -502,8 +502,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -524,7 +524,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -570,7 +570,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -585,7 +585,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- nn_hierarchical_seq.py      (original)
+++ nn_hierarchical_seq.py      (refactored)
@@ -129,7 +129,7 @@
         return values - mn

     def get_values(data_frame):
-        print(data_frame['labels'].values.shape)
+        print((data_frame['labels'].values.shape))
         labels = reshape(data_frame['labels'].values)
         ngrams = sequence.pad_sequences(
             data_frame['ngrams'].values, maxlen=MAXLEN)
@@ -356,7 +356,7 @@

     model = model.layers[1]
     output = model.predict_generator(test_generator, val_samples=len(test_data))
-    print(output.shape)
+    print((output.shape))
     return
     logging.info('Predicting')
     preds = model.predict_generator(
@@ -379,8 +379,8 @@
     logging.info('Fmax measure: \t %f %f %f %f' % (f, p, r, t))
     logging.info('ROC AUC: \t %f ' % (roc_auc, ))
     logging.info('MCC: \t %f ' % (mcc, ))
-    print('%.3f & %.3f & %.3f & %.3f & %.3f' % (
-        f, p, r, roc_auc, mcc))
+    print(('%.3f & %.3f & %.3f & %.3f & %.3f' % (
+        f, p, r, roc_auc, mcc)))
     # logging.info('Inconsistent predictions: %d' % incon)
     # logging.info('Saving the predictions')
     # proteins = test_df['proteins']
@@ -440,7 +440,7 @@
         rc = 0
         total = 0
         p_total = 0
-        for i in xrange(len(labels)):
+        for i in range(len(labels)):
             tp = np.sum(labels[i] * predictions[i])
             fp = np.sum(predictions[i]) - tp
             fn = np.sum(labels[i]) - tp
@@ -471,13 +471,13 @@

 def function_centric_performance(functions, preds, labels):
     preds = np.round(preds, 2)
-    for i in xrange(len(functions)):
+    for i in range(len(functions)):
         f_max = 0
         p_max = 0
         r_max = 0
         x = list()
         y = list()
-        for t in xrange(1, 100):
+        for t in range(1, 100):
             threshold = t / 100.0
             predictions = (preds[i, :] > threshold).astype(np.int32)
             tp = np.sum(predictions * labels[i, :])
@@ -498,8 +498,8 @@
                 r_max = recall
         num_prots = np.sum(labels[i, :])
         roc_auc = auc(x, y)
-        print('%s %f %f %f %d %f' % (
-            functions[i], f_max, p_max, r_max, num_prots, roc_auc))
+        print(('%s %f %f %f %d %f' % (
+            functions[i], f_max, p_max, r_max, num_prots, roc_auc)))


 def compute_roc(preds, labels):
@@ -520,7 +520,7 @@
     p_max = 0
     r_max = 0
     t_max = 0
-    for t in xrange(1, 100):
+    for t in range(1, 100):
         threshold = t / 100.0
         predictions = (preds > threshold).astype(np.int32)
         total = 0
@@ -566,7 +566,7 @@
 def get_gos(pred):
     mdist = 1.0
     mgos = None
-    for i in xrange(len(labels_gos)):
+    for i in range(len(labels_gos)):
         labels, gos = labels_gos[i]
         dist = distance.cosine(pred, labels)
         if mdist > dist:
@@ -581,7 +581,7 @@
     train_labels = train_df['labels'].values
     train_gos = train_df['gos'].values
     global labels_gos
-    labels_gos = zip(train_labels, train_gos)
+    labels_gos = list(zip(train_labels, train_gos))
     p = Pool(64)
     pred_gos = p.map(get_gos, preds)
     total = 0
--- plots.py    (original)
+++ plots.py    (refactored)
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from __future__ import print_function
+
 import os
 import sys
 import numpy as np
@@ -64,7 +64,7 @@
             index.append(i)
     df = df.iloc[index]
     print(len(df))
-    lens = map(len, df['sequences'])
+    lens = list(map(len, df['sequences']))
     c = 0
     for i in lens:
         if i <= 1002:
@@ -72,7 +72,7 @@
     print(c)
     h = np.histogram(lens, bins=(
         0, 500, 1000, 1500, 2000, 40000))
-    plt.bar(range(5),
+    plt.bar(list(range(5)),
         h[0], width=1, facecolor='green')
     titles = ['<=500', '<=1000', '<=1500', '<=2000', '>2000']
     plt.xticks(np.arange(0.5, 5.5, 1), titles)
--- predict.py  (original)
+++ predict.py  (refactored)
@@ -132,12 +132,12 @@
     # logging.info('Inconsistent predictions: %d' % incon)

     predictions = list()
-    for i in xrange(len(targets)):
+    for i in range(len(targets)):
         predictions.append(preds[i])
     df = pd.DataFrame({
         'targets': targets,
         'predictions': predictions})
-    print(len(df))
+    print((len(df)))
     df.to_pickle(DATA_ROOT + model_name + '_preds_' + FUNCTION + '.pkl')
     logging.info('Done in %d sec' % (time.time() - start_time))

@@ -149,7 +149,7 @@
     for i, row in df.iterrows():
         preds = row['predictions']
         go_ids = list()
-        for i in xrange(len(preds)):
+        for i in range(len(preds)):
             if preds[i] >= threshold:
                 go_ids.append(functions[i])
         gos.append(filter_specific(go, go_ids))
--- predict_all.py      (original)
+++ predict_all.py      (refactored)
@@ -85,11 +85,11 @@
                                 seqs = list()
                                 info = list()
                         else:
-                            print('Ignoring sequence {} because its length > 1002'
-                              .format(inf))
+                            print(('Ignoring sequence {} because its length > 1002'
+                              .format(inf)))
                     else:
-                        print('Ignoring sequence {} because of ambigious AA'
-                              .format(inf))
+                        print(('Ignoring sequence {} because of ambigious AA'
+                              .format(inf)))

                     seq = ''
                 inf = line[1:].split()[0]
@@ -108,7 +108,7 @@
         p = Popen(['diamond', 'blastp', '-d', 'data/embeddings',
                    '--max-target-seqs', '1', '--min-score', '60',
                    '--outfmt', '6', 'qseqid', 'sseqid'], stdin=PIPE, stdout=PIPE)
-        for i in xrange(n):
+        for i in range(n):
             p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n')
         p.stdin.close()

@@ -119,13 +119,13 @@
                 if len(it) == 2:
                     prot_ids[it[1]] = int(it[0])

-    prots = embed_df[embed_df['accessions'].isin(prot_ids.keys())]
+    prots = embed_df[embed_df['accessions'].isin(list(prot_ids.keys()))]
     for i, row in prots.iterrows():
         embeds[prot_ids[row['accessions']], :] = row['embeddings']

-    for i in xrange(len(sequences)):
+    for i in range(len(sequences)):
         seq = sequences[i]
-        for j in xrange(min(MAXLEN, len(seq)) - gram_len + 1):
+        for j in range(min(MAXLEN, len(seq)) - gram_len + 1):
             data[i, j] = vocab[seq[j: (j + gram_len)]]
     return [data, embeds]

@@ -133,13 +133,13 @@
 def predict(data, model, model_name, functions, threshold, batch_size):
     n = data[0].shape[0]
     result = list()
-    for i in xrange(n):
+    for i in range(n):
         result.append(list())
     predictions = model.predict(
         data, batch_size=batch_size, verbose=1)
-    for i in xrange(n):
+    for i in range(n):
         pred = (predictions[i] >= threshold).astype('int32')
-        for j in xrange(len(functions)):
+        for j in range(len(functions)):
             if pred[j] == 1:
                 result[i].append(model_name + '_' + functions[j] + '|' + '%.2f' % predictions[i][j])
     return result
@@ -157,8 +157,8 @@
     for key, gram in enumerate(ngram_df['ngrams']):
         vocab[gram] = key + 1
         gram_len = len(ngram_df['ngrams'][0])
-    print('Gram length:', gram_len)
-    print('Vocabulary size:', len(vocab))
+    print(('Gram length:', gram_len))
+    print(('Vocabulary size:', len(vocab)))
     threshold = 0.3
     # sequences = ['MKKVLVINGPNLNLLGIREKNIYGSVSYEDVLKSISRKAQELGFEVEFFQSNHEGEIIDKIHRAYFEKVDAIIINPGAYTHYSYAIHDAIKAVNIPTIEVHISNIHAREEFRHKSVIAPACTGQISGFGIKSYIIALYALKEILD']
     # data = get_data(sequences)
@@ -167,7 +167,7 @@
         df = pd.read_pickle('data/models/%s.pkl' % onto)
         functions = df['functions']
         models.append((model, functions))
-        print 'Model %s initialized.' % onto
+        print('Model %s initialized.' % onto)
         # result = predict(data, model, functions, threshold)
         # print result

@@ -180,15 +180,15 @@
     data = get_data(sequences, prot_ids)
     result = list()
     n = len(sequences)
-    for i in xrange(n):
+    for i in range(n):
         result.append([])
     for i in range(len(models)):
         model, functions = models[i]
-        print 'Running predictions for model %s' % funcs[i]
+        print('Running predictions for model %s' % funcs[i])
         res = predict(data, model, funcs[i], functions, threshold, batch_size)
-        for j in xrange(n):
+        for j in range(n):
             result[j] += res[j]
-    print('Predictions time: {}'.format(time.time() - start_time))
+    print(('Predictions time: {}'.format(time.time() - start_time)))
     return result


--- stats.py    (original)
+++ stats.py    (refactored)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import numpy as np
 import pandas as pd
 import click as ck
--- text.py     (original)
+++ text.py     (refactored)
@@ -1,7 +1,7 @@
 #!/usr/bin/env python

-from __future__ import print_function
-from __future__ import absolute_import
+
+
 import sys
 import os
 import pandas as pd
@@ -51,7 +51,7 @@
             items = line.strip().split('\t')
             if items[0] in uni_ids:
                 text_reps[uni_ids[items[0]]] = np.array(
-                    map(float, items[1:]), dtype='float32')
+                    list(map(float, items[1:])), dtype='float32')
     return text_reps


--- tf_utils.py (original)
+++ tf_utils.py (refactored)
@@ -1,6 +1,6 @@
-from __future__ import absolute_import
-from __future__ import print_function
-from __future__ import division
+
+
+
 import tensorflow as tf
 import numpy as np

--- utils.py    (original)
+++ utils.py    (refactored)
@@ -87,10 +87,10 @@
                     obj['is_obsolete'] = True
     if obj is not None:
         go[obj['id']] = obj
-    for go_id in go.keys():
+    for go_id in list(go.keys()):
         if go[go_id]['is_obsolete']:
             del go[go_id]
-    for go_id, val in go.iteritems():
+    for go_id, val in go.items():
         if 'children' not in val:
             val['children'] = set()
         for p_id in val['is_a']:
@@ -247,16 +247,16 @@
         else:
             if self.monitor_op(current, self.best):
                 if self.verbose > 0:
-                    print('Epoch %05d: %s improved from %0.5f to %0.5f,'
+                    print(('Epoch %05d: %s improved from %0.5f to %0.5f,'
                           ' saving model to %s'
                           % (epoch, self.monitor, self.best,
-                             current, filepath))
+                             current, filepath)))
                 self.best = current
                 save_model_weights(self.model, filepath)
             else:
                 if self.verbose > 0:
-                    print('Epoch %05d: %s did not improve' %
-                          (epoch, self.monitor))
+                    print(('Epoch %05d: %s did not improve' %
+                          (epoch, self.monitor)))


 class DataGenerator(object):
@@ -275,12 +275,12 @@
         self.has_targets = targets is not None

     def __next__(self):
-        return self.next()
+        return next(self)

     def reset(self):
         self.start = 0

-    def next(self):
+    def __next__(self):
         if self.start < self.size:
             # output = []
             # if self.has_targets:
@@ -304,7 +304,7 @@
             return res_inputs
         else:
             self.reset()
-            return self.next()
+            return next(self)


 if __name__ == '__main__':
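
One caveat with the output above: in files that were missing from __future__ import print_function, 2to3 cannot tell whether print('a', b) was meant as a function call or as a statement printing a tuple, so it preserves the tuple semantics by adding an extra pair of parentheses. The converted code runs, but lines like

    print(('Gram length:', gram_len))

print a tuple such as ('Gram length:', 3). Dropping the inner parentheses gives the presumably intended space-separated output:

    print('Gram length:', gram_len)

These cases need to be cleaned up by hand after running the script.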


oeway commented Sep 29, 2019

I found that two additional changes need to be made in predict_all.py:

  1. change line 112, p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n') to

p.stdin.write(bytes('>' + str(i) + '\n' + sequences[i] + '\n', 'utf-8'))

  2. change line 118, it = line.strip().split('\t') to

it = line.decode('utf-8').strip().split('\t')

It would be great if someone could test a Python 3 version.
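
Alternatively, both fixes can be avoided by opening the pipes in text mode. A sketch, untested (universal_newlines=True is the pre-3.7 spelling of text=True; the Popen arguments, n, and sequences are taken from the surrounding predict_all.py code):

    from subprocess import Popen, PIPE

    # Text mode: stdin accepts str and stdout yields str, so no manual
    # bytes encoding/decoding is needed at the call sites.
    p = Popen(['diamond', 'blastp', '-d', 'data/embeddings',
               '--max-target-seqs', '1', '--min-score', '60',
               '--outfmt', '6', 'qseqid', 'sseqid'],
              stdin=PIPE, stdout=PIPE, universal_newlines=True)
    for i in range(n):
        p.stdin.write('>' + str(i) + '\n' + sequences[i] + '\n')
    p.stdin.close()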
