diff --git a/docs/source/tutorials.rst b/docs/source/tutorials.rst index a30eef3e..07a6fcf2 100644 --- a/docs/source/tutorials.rst +++ b/docs/source/tutorials.rst @@ -28,4 +28,5 @@ These tutorials use example data provided in the github-repository tutorials/ancestral tutorials/clock tutorials/mugration - tutorials/homoplasy \ No newline at end of file + tutorials/homoplasy + tutorials/arg \ No newline at end of file diff --git a/docs/source/tutorials/arg.rst b/docs/source/tutorials/arg.rst new file mode 100644 index 00000000..24d23306 --- /dev/null +++ b/docs/source/tutorials/arg.rst @@ -0,0 +1,46 @@ + +Using Recombination Event Knowledge to Improve Time Tree Inference +------------------------------------------------------------------ + +Although relatively uncommon, recombination can be a driver of pathogen evolution. However, recombining segments are often not used together in phylogenetic inference due to computational challenges. +Because of this, typically, segments of the genome with little to no recombination are used individually to infer the pathogen phylogeny, leading to a loss of information. +As recombination is an uncommon event, most segments will have portions of their phylogenies that show a great deal of overlap. Segments share evolutionary history in these areas of overlap and this knowledge can +be used to improve divergence time estimates in TreeTime. + +To infer branch lengths, TreeTime makes the assumption that the number of mutations on a branch of length :math:`\tau` is Poisson distributed + +.. math:: + n_{mut} \sim Pois(\mu \tau). + +Here :math:`\mu` is the mutation rate. +Both the mean and the variance of the number of mutations on a branch of length :math:`\tau` are :math:`\mu \tau`. +If :math:`\mu` is known, this relation can be used to estimate the branch length given the number of observed mutations, + +.. math:: + \tau^{infer} = \frac{n_{mut}}{\mu}. + +This estimator has expectation :math:`\tau` and variance :math:`\frac{\tau}{\mu}`. 
+When this is repeated :math:`L` times (i.e. for a sequence of :math:`L` nucleotides, each with mutation rate :math:`\mu`), +the average number of mutations is approximately normally distributed with mean :math:`\mu \tau` and standard error :math:`\sqrt{\frac{\mu \tau}{L}}`. +Assuming we know that a branch is shared between two segments, we can use the alignments of both segments on this branch to estimate the divergence time of this branch, decreasing the standard error. + +`TreeKnit `_ is a package that can infer recombination events from tree topologies. It returns lists of leaves that are connected by shared branches in pairs of trees. +These leaves can be used to determine so-called maximally compatible clades (MCCs), i.e. clades where topology is shared across trees. +If desired, TreeKnit additionally returns trees that have been resolved according to each other. + +This output can be used in TreeTime to improve the inference of time trees and in turn improve the ancestral sequence reconstruction and the clock tree inference. +The ``treetime arg`` command uses input trees and their corresponding alignments to infer time trees. It is assumed that the list of MCCs is in JSON format as described in `TreeKnit `_ . +For each tree, the list of maximally compatible clades with every other tree is used to determine if internal nodes are part of an MCC with another tree and, if they are, which MCC they belong to. +This is done using the `Fitch algorithm `_ (function: ``assign_all_mccs``). If a node and its parent both belong to the same MCC, then the branch between them is shared. +Take, for example, three trees of segments A, B and C. If a branch is shared between trees of segments A and B, then the alignments of both segments A and B can be used to infer the divergence time of the branch +(the alignment of segment C is masked at this position); if the branch is only in tree A, the alignments of segments B and C will be masked. 
If a branch is shared between the segments A, B and C, then alignments A, B and C can be used, +leading to more accurate branch length estimates than if only the alignment of segment A were used to infer the divergence time of that branch. + +In the test folder, there is an example of a standard TreeKnit output for three trees. TreeTime expects recombination information to be in `TreeKnit output format `_. This example can be used to run the ``treetime arg`` command from within the ``test`` folder: + +.. code:: bash + + treetime arg --trees arg/TreeKnit/tree_a_resolved.nwk arg/TreeKnit/tree_b_resolved.nwk arg/TreeKnit/tree_c_resolved.nwk --alignments arg/TreeKnit/aln_a.fasta arg/TreeKnit/aln_b.fasta arg/TreeKnit/aln_c.fasta --mccs arg/TreeKnit/MCCs.json --dates arg/TreeKnit/metadata.csv --clock-rate 0.0028 --outdir time_tree_arg_results + +For each input tree, TreeTime will output ancestral sequence reconstructions, dates of the tree nodes, as well as a time tree and a divergence tree, using information from other trees for shared branches. +The output will be written to the folder ``time_tree_arg_results``. 
diff --git a/test/arg/TreeKnit/MCCs.json b/test/arg/TreeKnit/MCCs.json new file mode 100644 index 00000000..c740446e --- /dev/null +++ b/test/arg/TreeKnit/MCCs.json @@ -0,0 +1,22 @@ +{ "MCC_dict" : { +"1": { + "trees":["tree_a", "tree_b"], +"mccs": [["3_0"], +["10_0", "4_0"], +["5_0", "8_0"], +["1_0", "2_0", "6_0", "7_0", "9_0"]] +}, +"2": { + "trees":["tree_a", "tree_c"], +"mccs": [["3_0"], +["10_0", "4_0"], +["5_0", "8_0"], +["1_0", "2_0", "6_0", "7_0", "9_0"]] +}, +"3": { + "trees":["tree_b", "tree_c"], +"mccs": [["1_0", "2_0", "6_0"], +["10_0", "3_0", "4_0", "5_0", "7_0", "8_0", "9_0"]] +} +} +} \ No newline at end of file diff --git a/test/arg/TreeKnit/aln_a.fasta b/test/arg/TreeKnit/aln_a.fasta new file mode 100644 index 00000000..32e645a6 --- /dev/null +++ b/test/arg/TreeKnit/aln_a.fasta @@ -0,0 +1,160 @@ +>1_0 +AATTCGACAAGACAATCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGACCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTCGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTCGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGAGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCTGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCACAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>6_0 +AATTCGACAAGACAATCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGACCAACCCA 
+CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTCGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTCGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGAGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCTGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCACAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>2_0 +AATTCGACAAGACAATCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGACCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTCGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTCGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGAGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCTGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCACAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>8_0 
+AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTCGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCATTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTTCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTCGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCACAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>5_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTCGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTTCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTCGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA 
+TCGGTTCCGCATACGACTGAGGGCTGCACAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>7_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTAGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTTGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCG +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCGCAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>4_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCT +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTAGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTTGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCG 
+GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCGCAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>10_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCT +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTAGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTTGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCG +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCGCAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>3_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTAGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCTTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAC +AAAGTGATCTGCACTGCGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTTGTTTGTCCGCTCAGCCTGATCCGCGGGTC 
+AGGGCGGAACAGTATGGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCG +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCGCAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG +>9_0 +AATTCGACAAGACACTCTCCATACGCTATTGGATGGATTCTCGAGGGTACGTGTATGCGTGTCCAACCCA +CCCATAAACGCGTCGAGAATTATGACCGATAGTAGATGGCCCCGTACCGGATCGGTGGGGGCTTAACCGG +AGCTTACTCGCTGACCACTTCTGGCCGTCATCCAAGGTTAAAAACCACGCCACCCAGCACGAAACATATC +GTTAGCGCCCACTGTAGAGGTAGGTGCTCGAGAAGGGTATTTCTTCGCGTTTTCTATTCCCCAAGAATTC +GGCCTTGGAACTGGTTGTCCACTACTCGAGGTTCACCCCCTGACTGGATTTTTTGACCTGTTGTCCCGAG +GCCTCCGCCCATGTACGAGTGGCTTCCTAGATAGTGTCCTAACCCTCTGCACTAGTAGAGTGAGCCGCCC +CATCAATCAACCACAAATATAAAGAAGACTCCTTAATTCTGACACGTTCCGCTCCGCATCATAAGAAAAC +TTTGATCAATCTGTTCTCGTCTCGAGACCGCCAGTAATAAACGGACACCCCTTTATGGCCGTTGTTTGAT +CAAGTGATCTGCACTACGAAAAATTTCGCCAACCGATCTCCGGTCCGGCCACTACACGTAGAAGCGTTTC +ATCACACGACATGAATGAGTAGCCCTACCGTGGGCATCTTTGTTTGTCCGCTCAGCCTGATCCGCGGGTC +AGGGCGGAACAGTATTGTTGGACTCCTTTCGGTGCGGGCATACCTAGTTTTCCTGGGTTAGAAACTTTCC +CCAATCCGTTGTTAGAGCCATCTTTAGAACAGTCCAGCGGTCGTGGCGCGGAAAAGTGTTGAGGGCTGCC +GCGGTGACGGATCCTCACTCTTCGCAGAGCTCCGGTTAGCCCAGTATGAACCTCATAAACCGTCTAAGAA +TCGGTTCCGCATACGACTGAGGGCTGCGCAGTAAGCACATAATATAGCGTTGTCAATACAGATACGAATA +CCTGGTCTACGCCGCAAGAG diff --git a/test/arg/TreeKnit/aln_b.fasta b/test/arg/TreeKnit/aln_b.fasta new file mode 100644 index 00000000..2e74a1a1 --- /dev/null +++ b/test/arg/TreeKnit/aln_b.fasta @@ -0,0 +1,160 @@ +>9_0 +ACATTTTGCGTGGGCTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGCTAGAATGAGTAG 
+TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>4_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGCTAGAATGAGTAG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGGCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>10_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC 
+TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGCTAGAATGAGTAG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGGCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>7_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAACCGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACACGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGCTAGAATGAGTAG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGCCATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>8_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA 
+GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGTTAGAATGAGTGG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAGCACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>5_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGAGTTAGAATGAGTGG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCAGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGCGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>3_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG 
+TCAGACCCTGCTATACCTAACCAACGCCGACGCAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGTTAGAATGAGTGG +TGGCGTCGTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAACAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAACGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>1_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACACAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGTTAGAATGAGTGG +TGGCGTCCTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAAGGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>6_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT 
+GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACACAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGTTAGAATGAGTGG +TGGCGTCCTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAAGGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC +>2_0 +ACATTTTGCGTGGACTAAGAGGTGACAATCCTCTGCAAAGTCGAGGCGGGTTTACACAGCGAGCCTACGT +GACACATCGTGCGTTGGACGCCTTAGTTATAGAGCCACTGCATGCGCACTGAATCCCGTTATGGTAAGGG +TCAGACCCTGCTATACCTAACCAACGCCGACACAGAGGGTGTGGCCGTGAAATAACTATATAGTTACTTA +GGGTCGATCTCACCATACCAAACGTGGGCACACTACGTGCAAATGAGCGTCCTTTGTAGGACTAAAATAC +TTCATTTTATTGACACAAACGGGGCCCTTGACATAGGCACTCAAGAATCGGTCCCCACGCGCAGTCGCCT +ACTCATACACTAAGGACGCGCTATATAACGTAAATGTGAGGATCGGCTTATGCCGCGTTAGAATGAGTGG +TGGCGTCCTGGATGCGAATGCTTCGAAGCTGCTGGGATAGGGCGGGCGATATAACGTGTTGCTACCGGTT +GGTCGGTGATTGGGGGTGTGGCCTTGTGGAATAGTCAACCGCAAGCGCCATCGCCAAGAGGTAGCTGTTG +ACAGGAATCATAGACATCCTTCATGGGATGTTTCAAATTAGCCCTGTGCATAACACTACGACTCGAATGT +GGCTTAACATTGGTGAACCAGAACCGTCGGACTCATCAGTTGCTCCTCACGGTCTGTCGCCACCACACGC +AGCACTTGAAATAAATCGTTAACTTTAGGGGGCTGGATGTGCCCTGCATGGGGCAGTGCGGGGCTGAGCG +TCCTGAAAGCAAACAATCGGCAGTCTCCTCATGGTCCACCGCCAGACCTTGCCAGAACAACAAGGCCTTC +CCTTACCTATCGAGAGGGTAGTAGACATGGCCAGGAGCGCAGCCAGTGGTCGGGCGCCGGTCATGCGCTT +CTTTATTCCGAACTTGGACCGCGATTATTTCGCTCTTCCTGAATAGCGATTTATTTGATCGCCAGTAATC +ACATCATTGGGGAATGTTTC diff 
--git a/test/arg/TreeKnit/aln_c.fasta b/test/arg/TreeKnit/aln_c.fasta new file mode 100644 index 00000000..6638cd99 --- /dev/null +++ b/test/arg/TreeKnit/aln_c.fasta @@ -0,0 +1,160 @@ +>9_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGCCTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCGAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCACATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>4_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACCGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGCCTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCACATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT 
+AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>10_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGCCTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCACATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>7_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTGTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGCCTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA 
+CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCACATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGGCCGA +>8_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACACCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCCGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGCGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>5_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT 
+ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCCGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGCGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>3_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGAGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTACAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACTTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCCGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGCGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>1_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT 
+TAAGTGTGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTAGAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACGTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>6_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA +GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGTGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTAGAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACGTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA +>2_0 +ACCAAGTCGGTGTTGGGCGTAGGGCTGTGCTCCCAGTTTTTTACCTCAAAATAATTGGGGAATTGGATAT +TAACTATGCAAAGTAGTTCTAAAGTTTCGAATTCTTAGCCGCGTCAGAACGTAATGGCAAGTTGTCAGTA +TAGAGACAGCCTATCGCCCCCCGGAGTTTCGCGACGTATAGAAGGTGAGTCTAATTGGTCCAGTCCTAAA +ATGCTTCGCTGACTCTGCCTGACTATGCAGGTGATCTAAAACAAAATTAACTACCGGCGCGCATTAAGAG +ATAAACATCAGCGAGAGTGGGCGCACCGGAGTCGCATCGATACAGTCAAGACCGGTTATTTTTCATCGGA 
+GCACCGGACCAGCCAATATGCGCCTGTGCGCTCATGGACCTGCCTCTTCTTAAGCCTCCACGGTATGGGT +TAAGTGTGCCTTCGCGGCCTCATAGGCTTACGCCGGGCATACTGACATGACTTTAGTGTCACGGCCGCAT +ACTCGCCTTTCGCAAGGCGGCTTAGCGGGCTGACTCTAGTTAATATACGTTCCCTCAAGCTAAAATATCA +CCAATGCAAAAGGTGCCATGCGTACCATTAATGCTACTTCCTGCAAAGGCCCCTCCATCGAGAATGCTCT +TCCTATATTAGAGTCGCATGAACTCGCCTTGTTAGATGAGTACTTCAGTCAGCCTAGAAACCCCTCGCGT +AGCCGCAGAACCATTGTGGCCACTGATCGCGGTTCAAACGTTGGTCTGATCGCAACTCAGTCGACAGCTT +AAACGGGTCAGGCGGACAGGCCCCACTGGTGCGGTTCTCAGAGGAGACCTAAGACGTTGTACCATACAAG +AATTCGGAGTAAGACGTGCGGACGAGTCTGATCCGGAAGCACCGATTAATTCTGAGGGGTAAGGGAGATT +CCACCTTCCGGAACCTCCGCAGAGGTCGCGCGTTACGGGATCTGGCACTGCACGCGTTTATTGCTTTTTT +ATGTCGTGTTCGCGGACCGA diff --git a/test/arg/TreeKnit/metadata.csv b/test/arg/TreeKnit/metadata.csv new file mode 100644 index 00000000..888cf3bb --- /dev/null +++ b/test/arg/TreeKnit/metadata.csv @@ -0,0 +1,11 @@ +name,date +1_0,2000.01 +6_0,2000.01 +2_0,2000.01 +8_0,2000.01 +5_0,2000.01 +7_0,2000.01 +4_0,2000.01 +10_0,2000.01 +3_0,2000.01 +9_0,2000.01 diff --git a/test/arg/TreeKnit/tree_a_resolved.nwk b/test/arg/TreeKnit/tree_a_resolved.nwk new file mode 100644 index 00000000..83856060 --- /dev/null +++ b/test/arg/TreeKnit/tree_a_resolved.nwk @@ -0,0 +1 @@ +(((1_0:0.00042160595317589734,(6_0:1.0808942097513535e-5,2_0:1.0808942097513535e-5)internal_1:0.00041079701107838383)internal_4:0.004424706951683469,(8_0:0.0008463505955400933,5_0:0.0008463505955400933)internal_9:0.003999962309319273)internal_14:0.00039448806504147895,(((7_0:0.000593105927716494,(4_0:3.1975715213597586e-5,10_0:3.1975715213597586e-5)internal_2:0.0005611302125028965)internal_6:5.255445800663714e-5,3_0:0.0006456603857231312)internal_7:0.0007718518012316954,9_0:0.0014175121869548265)internal_11:0.003823288782946018)internal_15:0; \ No newline at end of file diff --git a/test/arg/TreeKnit/tree_b_resolved.nwk b/test/arg/TreeKnit/tree_b_resolved.nwk new file mode 100644 index 00000000..c0507fc4 --- /dev/null +++ b/test/arg/TreeKnit/tree_b_resolved.nwk @@ -0,0 +1 @@ 
+((((9_0:0.0007157734425760744,(4_0:3.1975715213597586e-5,10_0:3.1975715213597586e-5)internal_2:0.0006837977273624769)internal_8:0.0007017387443787521,7_0:0.0014175121869548265)internal_11:0.0024284388519823638,((8_0:0.0008463505955400933,5_0:0.0008463505955400933)internal_9:5.698925281919542e-5,3_0:0.0009033398483592889)internal_10:0.0029426111905779015)internal_13:0.0013948499309636548,(1_0:0.00042160595317589734,(6_0:1.0808942097513535e-5,2_0:1.0808942097513535e-5)internal_1:0.00041079701107838383)internal_4:0.004819195016724947)internal_15:0; \ No newline at end of file diff --git a/test/arg/TreeKnit/tree_c_resolved.nwk b/test/arg/TreeKnit/tree_c_resolved.nwk new file mode 100644 index 00000000..c0507fc4 --- /dev/null +++ b/test/arg/TreeKnit/tree_c_resolved.nwk @@ -0,0 +1 @@ +((((9_0:0.0007157734425760744,(4_0:3.1975715213597586e-5,10_0:3.1975715213597586e-5)internal_2:0.0006837977273624769)internal_8:0.0007017387443787521,7_0:0.0014175121869548265)internal_11:0.0024284388519823638,((8_0:0.0008463505955400933,5_0:0.0008463505955400933)internal_9:5.698925281919542e-5,3_0:0.0009033398483592889)internal_10:0.0029426111905779015)internal_13:0.0013948499309636548,(1_0:0.00042160595317589734,(6_0:1.0808942097513535e-5,2_0:1.0808942097513535e-5)internal_1:0.00041079701107838383)internal_4:0.004819195016724947)internal_15:0; \ No newline at end of file diff --git a/test/command_line_tests.sh b/test/command_line_tests.sh index cdaa73a3..f5c89561 100755 --- a/test/command_line_tests.sh +++ b/test/command_line_tests.sh @@ -54,6 +54,15 @@ else echo "timetree_inference on vcf data failed $retval" fi +treetime arg --trees arg/TreeKnit/tree_a_resolved.nwk arg/TreeKnit/tree_b_resolved.nwk arg/TreeKnit/tree_c_resolved.nwk --alignments arg/TreeKnit/aln_a.fasta arg/TreeKnit/aln_b.fasta arg/TreeKnit/aln_c.fasta --mccs arg/TreeKnit/MCCs.json --dates arg/TreeKnit/metadata.csv --clock-rate 0.0028 --outdir time_tree_arg_results +retval="$?" 
+if [ "$retval" == 0 ]; then + echo "timetree arg on 3 trees ok" +else + ((all_tests++)) + echo "timetree arg on 3 trees failed $retval" +fi + treetime --tree treetime_examples/data/ebola/ebola.nwk --dates treetime_examples/data/ebola/ebola.metadata.csv --aln treetime_examples/data/ebola/ebola.fasta --coalescent skyline --gen-per-year 100 retval="$?" if [ "$retval" == 0 ]; then diff --git a/test/test_arg.py b/test/test_arg.py new file mode 100644 index 00000000..f8aad8ba --- /dev/null +++ b/test/test_arg.py @@ -0,0 +1,88 @@ +# Test the arg functions on multiple trees +def test_reading_in_TK_output(): + import treetime.arg + tree_nwk_files = ['test/arg/TreeKnit/tree_a_resolved.nwk','test/arg/TreeKnit/tree_b_resolved.nwk', 'test/arg/TreeKnit/tree_c_resolved.nwk'] + tree_names = ["tree_a", "tree_b", "tree_c"] + assert treetime.arg.get_tree_names(tree_nwk_files) == tree_names + MCC_dict = treetime.arg.get_MCC_dict('test/arg/TreeKnit/MCCs.json') + + # read trees, assert trees are named consistently + tree_names = ["tree_a", "tree_b", "tree_c"] + trees_in_dict = set().union(*MCC_dict.keys()) + assert all([k in trees_in_dict for k in tree_names])== True + +test_reading_in_TK_output() +def test_assign_mccs(): + from Bio import Phylo + from treetime import TreeTime + from treetime import arg + from treetime.utils import parse_dates + + tree = Phylo.read('test/arg/TreeKnit/tree_a_resolved.nwk', 'newick') + tt = TreeTime(dates=parse_dates("test/arg/TreeKnit/metadata.csv"), tree=tree, + aln="test/arg/TreeKnit/aln_a.fasta", gtr='JC69', alphabet='nuc', verbose=True, + fill_overhangs=True, keep_node_order=True, + compress=False) + + # make a lookup for the MCCs and assign to tree + MCC_dict = arg.get_MCC_dict('test/arg/TreeKnit/MCCs.json') + MCC_locs = [frozenset(["tree_a", "tree_b"]), frozenset(["tree_a", "tree_c"])] + MCCs = [MCC_dict[loc] for loc in MCC_locs] + leaf_to_MCC = arg.get_mcc_map(MCCs) + assert leaf_to_MCC == {'3_0': [0, 0], '10_0': [1, 1], '4_0': [1, 1], '5_0': [2, 
2], '8_0': [2, 2], '1_0': [3, 3], '2_0': [3, 3], '6_0': [3, 3], '7_0': [3, 3], '9_0': [3, 3]} + arg.assign_all_mccs(tt.tree, len(MCCs), leaf_to_MCC, tt.one_mutation) + for node in tt.tree.find_clades(): + assert node.mcc[0] == node.mcc[1] + if set([c.name for c in node.clades]) == set(["4_0", "10_0"]): + assert node.mcc == [1,1] + if set([c.name for c in node.clades]) == set(["8_0", "5_0"]): + assert node.mcc == [2,2] + +test_assign_mccs() + +def test_parse_args(): + from treetime import arg + import numpy as np + + tree_nwk_files = ['test/arg/TreeKnit/tree_a_resolved.nwk','test/arg/TreeKnit/tree_b_resolved.nwk', 'test/arg/TreeKnit/tree_c_resolved.nwk'] + aln_files = ['test/arg/TreeKnit/aln_a.fasta','test/arg/TreeKnit/aln_b.fasta', 'test/arg/TreeKnit/aln_c.fasta'] + MCC_file = 'test/arg/TreeKnit/MCCs.json' + + dict_ = arg.parse_arg(tree_nwk_files, aln_files, MCC_file, fill_overhangs=True) + assert sum(dict_["masks_dict"][frozenset(["tree_a"])]) == sum(dict_["masks_dict"][frozenset(["tree_b"])]) == sum(dict_["masks_dict"][frozenset(["tree_c"])]) ==1000 + assert all(dict_["masks_dict"][frozenset(["tree_a"])] == np.concatenate((np.ones(1000), np.zeros(2000)))) + assert all(dict_["masks_dict"][frozenset(["tree_a", "tree_b"])] == np.concatenate((np.ones(2000), np.zeros(1000)))) + assert all(dict_["masks_dict"][frozenset(["tree_a", "tree_b", "tree_c"])] == np.ones(3000)) + +test_parse_args() + +def test_setup_arg(): + from treetime import arg + from treetime.utils import parse_dates + import numpy as np + + tree_nwk_files = ['test/arg/TreeKnit/tree_a_resolved.nwk','test/arg/TreeKnit/tree_b_resolved.nwk', 'test/arg/TreeKnit/tree_c_resolved.nwk'] + aln_files = ['test/arg/TreeKnit/aln_a.fasta','test/arg/TreeKnit/aln_b.fasta', 'test/arg/TreeKnit/aln_c.fasta'] + MCC_file = 'test/arg/TreeKnit/MCCs.json' + dates = parse_dates("test/arg/TreeKnit/metadata.csv") + + dict_ = arg.parse_arg(tree_nwk_files, aln_files, MCC_file, fill_overhangs=True) + + ##check if arg is set up correctly on tree_b +
masked_tree_b = arg.setup_arg("tree_b", dict_["trees_dict"], dict_["alignment"], dates, dict_["MCCs_dict"], dict_["masks_dict"], gtr='JC69', + verbose=0, fill_overhangs=True, reroot=False, fixed_clock_rate=0.001, alphabet='nuc') + + node_dict = {} + for node in masked_tree_b.tree.find_clades(): + node_dict[node.name] = node + for node in masked_tree_b.tree.find_clades(): + if node.name == "3_0" or set([c.name for c in node.clades]) == set(["4_0", "10_0"]) or set([c.name for c in node.clades]) == set(["3_0", "internal_9"]) or set([c.name for c in node.clades]) == set(["8_0", "5_0"]): + assert all(node.mask == np.concatenate((np.zeros(1000), np.ones(2000)))) + elif node.name in set([c.name for c in masked_tree_b.tree.root.clades]): + assert all(node.mask == np.concatenate((np.ones(2000), np.zeros(1000)))) + elif not node.up: + assert all(node.mask == np.concatenate((np.zeros(1000), np.ones(1000), np.zeros(1000)))) + else: + assert all(node.mask == np.ones(3000)) + +test_setup_arg() diff --git a/treetime/arg.py b/treetime/arg.py index 3d82008b..55bc9cab 100644 --- a/treetime/arg.py +++ b/treetime/arg.py @@ -1,41 +1,147 @@ +from ctypes import alignment from matplotlib.pyplot import fill import numpy as np +import json +import itertools +from os import path +import random -def parse_arg(tree1, tree2, aln1, aln2, MCC_file, fill_overhangs=True): +def get_tree_names(tree_nwk_files): + ''' + Input: string list of `.nwk` file PATH locations + + Returns tree names from `.nwk` file names (using TreeKnit standard) + ''' + tree_names = [] + for file in tree_nwk_files: + file_name = path.splitext(path.basename(file))[0] + file_name = file_name.replace("_resolved", "").replace("resolved", "") + tree_names.append(file_name) + if len(set(tree_names)) != len(tree_nwk_files): + #tree name identifiers are not unique + raise Exception("Error: Tree names must be unique, see TreeKnit output format.") + return tree_names + +def get_MCC_dict(MCC_file): + ''' + Read in MCCs from TreeKnit 
.json output file + + Returns: + ------- + MCC_dict : dict{frozenset(str), list(str)} + key : frozenset of tree name pairs + item : list of lists of leaf names + two leaves are in the same list if they are in a maximally compatible + clade, meaning in that tree pair there was no recombination in their subclade + + ''' + with open(MCC_file) as f: + data = json.load(f) + MCC_dict = {} + for key in data["MCC_dict"]: + MCC_dict[frozenset(data["MCC_dict"][key]["trees"])] = data["MCC_dict"][key]["mccs"] + + return MCC_dict + +def get_mask_dict(length_segments, tree_names): + """Create alignment masks for tree branches corresponding to which trees + share this branch. + + Parameters + ---------- + length_segments : int list + length of segment in each tree + tree_names : str list + name of each corresponding tree + + Returns + ------- + mask_dict: dictionary + key is a frozenset of tree names, items are boolean masks of length + len(joint alignment), positions in the mask are only 1 if they + correspond to positions in the segments of the trees given + tree_segment_positions : dictionary + start and end position of each segment's position in the combined alignment + """ + #list of start positions (or end positions +1) of each segment + pos_list = [0] + for l in length_segments: + new = pos_list[-1] + l + pos_list.append(new) + #create dictionary of start and end position of each segment in alignment + tree_segment_positions = {} + for i in range(len(tree_names)): + tree_segment_positions[tree_names[i]] = (pos_list[i], pos_list[i+1]) + + #mask dictionary + mask = {} + no_trees = len(tree_names) + for r in range(1,(no_trees+1)): + #all combinations of at least one tree + combos = itertools.combinations(range(1, (no_trees+1)), r) + for comb in combos: + new_mask = np.zeros(sum(length_segments)) + for c in comb: + new_mask[pos_list[c-1]:pos_list[c]] = 1 + mask[frozenset([tree_names[c-1] for c in comb])] = new_mask + return mask, tree_segment_positions + +def parse_arg(tree_files,
aln_files, MCC_file, fill_overhangs=True): """parse the output of TreeKnit and return a file structure to be further consumed by TreeTime - Args: - tree1 (str): file name of tree1 - tree2 (str): file name of tree2 - aln1 (str): file name of alignment 1 - aln2 (str): file name of alignment 2 - MCC_file (str): name of mcc file - fill_overhangs (bool, optional): fill terminal gaps of alignmens before concatenating. Defaults to True. + Parameters + ---------- + tree_files : str list + file names of trees + aln_files : str list + file names of alignments MUST be in the same order as tree_files + MCC_file : str + name of mcc file + fill_overhangs : bool, optional + fill terminal gaps of alignments before concatenating. Defaults to True. - Returns: - dict: dictionary containing the two trees, the concatenated alignment, full and segment masks, and the MCCs + Returns + ---------- + dict: dictionary containing + dict["MCCs_dict"]: MCCs dictionary + key is frozenset of tree names, items are a list of leaf name lists + dict["trees_dict"] : tree dictionary + key is the tree name + dict["alignment"] :MultipleSeqAlignment + the concatenated alignment + dict["masks_dict"] : mask dictionary + key is a frozenset of tree names + dict["seg_pos_dict"] : dictionary + start and end position of each tree's sequence in dict["alignment"] """ from Bio import Phylo, AlignIO, Seq from Bio.Align import MultipleSeqAlignment from treetime.seq_utils import seq2array - # read trees and determine common terminal nodes - t1 = Phylo.read(tree1, 'newick') - t2 = Phylo.read(tree2, 'newick') - all_leaves = set.intersection(set([x.name for x in t1.get_terminals()]), set([x.name for x in t2.get_terminals()])) - # read MCCs as lists of taxon names - MCCs = [] - with open(MCC_file) as fh: - for line in fh: - if line.strip(): - MCCs.append(line.strip().split(',')) + MCC_dict = get_MCC_dict(MCC_file) + + # read trees, assert trees are named consistently + tree_names = get_tree_names(tree_files) + trees_in_dict =
set().union(*MCC_dict.keys()) + assert(all([k in trees_in_dict for k in tree_names])) + trees_dict = {} + for i in range(0, len(tree_files)): + trees_dict[tree_names[i]] = Phylo.read(tree_files[i], 'newick') + + # determine common terminal nodes + all_leaves = set.intersection(*[set([x.name for x in t.get_terminals()]) for (k, t) in trees_dict.items()]) # read alignments and construct edge modified sequence arrays - a1 = {s.id:s for s in AlignIO.read(aln1, 'fasta')} - a2 = {s.id:s for s in AlignIO.read(aln2, 'fasta')} - for aln in [a1,a2]: + alignments = [] + alignment_lengths = [] + for aln_fname in aln_files: + aln = AlignIO.read(aln_fname, 'fasta') + alignment_lengths.append(aln.get_alignment_length()) + alignments.append({s.id:s for s in aln}) + + for aln in alignments: for s,seq in aln.items(): seqstr = "".join(seq2array(seq, fill_overhangs=fill_overhangs)) seq.seq = Seq.Seq(seqstr) @@ -43,107 +149,148 @@ def parse_arg(tree1, tree2, aln1, aln2, MCC_file, fill_overhangs=True): # construct concatenated alignment aln_combined = [] for leaf in all_leaves: - seq = a1[leaf] + a2[leaf] + concat_seq = alignments[0][leaf] + for a in range(1, len(alignments)): + concat_seq += alignments[a][leaf] + seq = concat_seq seq.id = leaf aln_combined.append(seq) - # construct masks for the concatenation and the two segments - l1 = len(a1[leaf]) - l2 = len(a2[leaf]) - combined_mask = np.ones(l1 + l2) - mask1 = np.zeros(l1 + l2) - mask2 = np.zeros(l1 + l2) - mask1[:l1] = 1 - mask2[l1:] = 1 + # construct masks for the concatenated alignment + masks, segment_positions = get_mask_dict(alignment_lengths, tree_names) - return {"MCCs": MCCs, "trees":[t1,t2], "alignment":MultipleSeqAlignment(aln_combined), - "masks":[mask1,mask2], "combined_mask":combined_mask} + return {"MCCs_dict": MCC_dict, "trees_dict":trees_dict, "alignment":MultipleSeqAlignment(aln_combined), + "masks_dict":masks, "seg_pos_dict":segment_positions} -def setup_arg(T, aln, total_mask, segment_mask, dates, MCCs, 
gtr='JC69', + + +def setup_arg(tree_name, trees_dict, aln, dates, MCCs_dict, masks_dict, gtr='JC69', verbose=0, fill_overhangs=True, reroot=True, fixed_clock_rate=None, alphabet='nuc', **kwargs): """construct a TreeTime object with the appropriate masks on each node for branch length optimization with full or segment only alignment. - Args: - T (str, Bio.Phylo.Tree): tree of focal segment - aln (Bio.Align.MultipleSeqAlignment): Concatenated multiple sequence alignment - total_mask (np.array): boolean array that is true for the entire sequence - segment_mask (np.array): boolean array that is true only for the focal segment - dates (dict): sampling dates - MCCs (list): list of MCCs - gtr (str, optional): GTR model. Defaults to 'JC69'. - verbose (int, optional): verbosity. Defaults to 0. - fill_overhangs (bool, optional): treat terminal gap as missing. Defaults to True. - reroot (bool, optional): reroot the tree. Defaults to True. + Parameters: + --------- + tree_name : str + name of focal segment / tree + trees_dict : dictionary{str, Bio.Phylo.Tree} + key is tree_name + aln : Bio.Align.MultipleSeqAlignment): + Concatenated multiple sequence alignment + dates : dict + sampling dates + MCCs_dict : dictionary{frozenset(str), list(str)} + key frozenset of tree_names, item MCC as str of leaf nodes + gtr : str, optional + GTR model. Defaults to 'JC69'. + verbose : int, optional + verbosity. Defaults to 0. + fill_overhangs : bool, optional + treat terminal gap as missing. Defaults to True. + reroot : bool, optional + reroot the tree. Defaults to True. 
Returns: + -------- TreeTime: TreeTime instance """ from treetime import TreeTime + T= trees_dict[tree_name] ##desired tree + ##get list of MCCs of all other trees with T and the order of these trees + MCCs = [] + other_tree_order = {} + i =0 + for t in trees_dict.keys(): + if t != tree_name: + other_tree_order[i] = t + MCCs.append(MCCs_dict[frozenset([tree_name, t])]) + i +=1 + num_other_trees = len(other_tree_order.keys()) + tt = TreeTime(dates=dates, tree=T, aln=aln, gtr=gtr, alphabet=alphabet, verbose=verbose, fill_overhangs=fill_overhangs, keep_node_order=True, compress=False, **kwargs) - if reroot: tt.reroot("least-squares", force_positive=True, clock_rate=fixed_clock_rate) # make a lookup for the MCCs and assign to tree - leaf_to_MCC = {} - for mi,mcc in enumerate(MCCs): - for leaf in mcc: - leaf_to_MCC[leaf] = mi + leaf_to_MCC = get_mcc_map(MCCs) - assign_mccs(tt.tree, leaf_to_MCC, tt.one_mutation) + assign_all_mccs(tt.tree, num_other_trees, leaf_to_MCC, tt.one_mutation) # assign masks to branches whenever child and parent are in the same MCC for n in tt.tree.find_clades(): - if (n.mcc is not None) and n.up and n.up.mcc==n.mcc: - n.mask = total_mask - else: - n.mask = segment_mask + shared = [(n.mcc[other_tree] is not None) and n.up and n.up.mcc[other_tree]==n.mcc[other_tree] + for other_tree in range(num_other_trees)] + ##use tree_order to convert position in MCC list to tree_names and see which trees share this branch and assign a proper mask + branch_shared = [other_tree_order[i] for i, x in enumerate(shared) if x] + branch_shared.append(tree_name) ##branch is always in tree_name + n.mask = masks_dict[frozenset(branch_shared)] return tt -def assign_mccs(tree, mcc_map, one_mutation=1e-4): - """Assign MCCs to all terminal and internal branches of the tree. - - Args: - tree (Bio.Phylo.Tree): tree - mcc_map (dict): map from leaf to mcc - one_mutation (float, optional): minimal length of branches. Defaults to 1e-4. 
+def get_mcc_map(MCCs_list, shuffle=False): """ - # assign MCCs to leaves + Make a lookup for the MCCs and assign to trees. + Each leaf will be assigned a list of mcc clades in the order of `MCCs_list`. + """ + leaf_to_MCC = {} + for MCCs in MCCs_list: + mcc_index = list(range(len(MCCs))) + if shuffle: + random.seed(987) + random.shuffle(mcc_index) + for mi,mcc in enumerate(MCCs): + for leaf in mcc: + if leaf not in leaf_to_MCC: + leaf_to_MCC[leaf] = [mcc_index[mi]] + else: + leaf_to_MCC[leaf].append(mcc_index[mi]) + return leaf_to_MCC + + +def assign_all_mccs(tree, num_other_trees, mcc_map, one_mutation=1e-4): + ''' + For each node in the tree, if it is part of a mcc clade, assign it to that clade using Fitch. + Do this for every MCC between the focal tree `tree` and another tree. + Additionally assign a minimal branch length to all branches to avoid any numerical issues. + + ''' + #child_mccs is a list of sets. Each set corresponds to which mccs the children of that node are in that segment. 
for leaf in tree.get_terminals(): - leaf.child_mccs = set([mcc_map[leaf.name]]) + leaf.child_mccs = [set([mcc_map[leaf.name][other_tree]]) for other_tree in range(num_other_trees)] leaf.mcc = mcc_map[leaf.name] leaf.branch_length = max(0.5*one_mutation, leaf.branch_length) - # reconstruct MCCs with Fitch algorithm for n in tree.get_nonterminals(order='postorder'): - common_mccs = set.intersection(*[c.child_mccs for c in n]) + common_mccs = [set.intersection(*[c.child_mccs[other_tree] for c in n]) for other_tree in range(num_other_trees)] n.branch_length = max(0.5*one_mutation, n.branch_length) - if len(common_mccs): - n.child_mccs = common_mccs + n.child_mccs = [] + for other_tree in range(num_other_trees): + if len(common_mccs[other_tree]): + n.child_mccs.append(common_mccs[other_tree]) + else: + n.child_mccs.append(set.union(*[c.child_mccs[other_tree] for c in n])) + mcc_intersection = [set.intersection(*[c.child_mccs[other_tree] for c in tree.root]) for other_tree in range(num_other_trees)] + tree.root.mcc = [] + for other_tree in range(num_other_trees): + if len(mcc_intersection[other_tree]): + tree.root.mcc.append(list(mcc_intersection[other_tree])[0]) else: - n.child_mccs = set.union(*[c.child_mccs for c in n]) - - mcc_intersection = set.intersection(*[c.child_mccs for c in tree.root]) - if len(mcc_intersection): - tree.root.mcc = list(mcc_intersection)[0] - else: - tree.root.mcc = None - + tree.root.mcc.append(None) for n in tree.get_nonterminals(order='preorder'): if n==tree.root: continue else: - if n.up.mcc in n.child_mccs: # parent MCC part of children -> that is the MCC - n.mcc = n.up.mcc - elif len(n.child_mccs)==1: # child is an MCC - n.mcc = list(n.child_mccs)[0] - else: # no unique child MCC and no match with parent -> not part of an MCCs - n.mcc = None + n.mcc = [] + for other_tree in range(num_other_trees): + if n.up.mcc[other_tree] in n.child_mccs[other_tree]: # parent MCC part of children -> that is the MCC + n.mcc.append(n.up.mcc[other_tree]) + 
elif len(n.child_mccs[other_tree])==1: # child is an MCC + n.mcc.append(list(n.child_mccs[other_tree])[0]) + else: # no unique child MCC and no match with parent -> not part of an MCCs + n.mcc.append(None) \ No newline at end of file diff --git a/treetime/argument_parser.py b/treetime/argument_parser.py index 338df618..056e0b54 100644 --- a/treetime/argument_parser.py +++ b/treetime/argument_parser.py @@ -308,11 +308,12 @@ def toplevel(params): ## ARG arg_parser = subparsers.add_parser('arg', - description="Calculates the root-to-tip regression and quantifies the 'clock-i-ness' of the tree. " - "It will reroot the tree to maximize the clock-like " - "signal and recalculate branch length unless run with --keep_root.") - arg_parser.add_argument('--trees', nargs=2, required=True, type=str) - arg_parser.add_argument('--alignments', nargs=2, required=True, type=str) + description="Command to use recombination event information to better estimate time trees, clock rates and ancestral reconstruction of sequences." 
+ " Given trees, their alignments and a list of maximally compatible clades (shared topological " + "structures where no recombination events have occurred) this command will perform standard treetime inference " + "on all trees, using alignment information from other trees for inference on shared branches (branches in MCCs).") + arg_parser.add_argument('--trees', nargs='+', required=True, type=str) + arg_parser.add_argument('--alignments', nargs='+', required=True, type=str) arg_parser.add_argument('--mccs', required=True, type=str) add_timetree_args(arg_parser) add_time_arguments(arg_parser) @@ -321,7 +322,18 @@ def toplevel(params): add_reroot_group(arg_parser) add_common_args(arg_parser) - arg_parser.set_defaults(func=arg_time_trees) + + def toplevel_arg(params): + if len(params.trees) <2 or len(params.alignments) <2: + print(treetime_description+timetree_description+subcommand_description+ + "'arg' requires at least two tree and alignment files.\n") + elif len(params.trees) != len(params.alignments): + print(treetime_description+timetree_description+subcommand_description+ + "'arg' requires the same number of tree and alignment files, it is important that these are given in the same order.\n") + else: + arg_time_trees(params) + + arg_parser.set_defaults(func=toplevel_arg) # make a version subcommand diff --git a/treetime/treeanc.py b/treetime/treeanc.py index b07b2d06..15ea44bb 100644 --- a/treetime/treeanc.py +++ b/treetime/treeanc.py @@ -1585,7 +1585,7 @@ def cost_func(sqrt_mu): ############################################################################### ### Utility functions ############################################################################### - def get_reconstructed_alignment(self, reconstruct_tip_states=False): + def get_reconstructed_alignment(self, reconstruct_tip_states=False, aln_slice=None): """ Get the multiple sequence alignment, including reconstructed sequences for the internal nodes.
@@ -1617,11 +1617,17 @@ def get_reconstructed_alignment(self, reconstruct_tip_states=False): new_aln['positions'] = self.data.nonref_positions new_aln['inferred_const_sites'] = self.data.inferred_const_sites else: - new_aln = MultipleSeqAlignment([SeqRecord(id=n.name, + if aln_slice: + start, end = aln_slice + new_aln = MultipleSeqAlignment([SeqRecord(id=n.name, + seq=Seq(self.sequence(n, reconstructed=reconstruct_tip_states, + as_string=True, compressed=False)[start:end]), description="") + for n in self.tree.find_clades()]) + else: + new_aln = MultipleSeqAlignment([SeqRecord(id=n.name, seq=Seq(self.sequence(n, reconstructed=reconstruct_tip_states, as_string=True, compressed=False)), description="") for n in self.tree.find_clades()]) - return new_aln diff --git a/treetime/wrappers.py b/treetime/wrappers.py index a5e2b905..ea64105c 100644 --- a/treetime/wrappers.py +++ b/treetime/wrappers.py @@ -162,14 +162,14 @@ def plot_rtt(tt, fname): def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False, report_ambiguous=False, timetree=False, confidence=False, - reconstruct_tip_states=False, tree_suffix=''): + reconstruct_tip_states=False, tree_suffix='', aln_slice=None): seq_info = is_vcf or tt.aln if is_vcf: outaln_name = basename + f'ancestral_sequences{tree_suffix}.vcf' write_vcf(tt.get_reconstructed_alignment(reconstruct_tip_states=reconstruct_tip_states), outaln_name) elif tt.aln: outaln_name = basename + f'ancestral_sequences{tree_suffix}.fasta' - AlignIO.write(tt.get_reconstructed_alignment(reconstruct_tip_states=reconstruct_tip_states), outaln_name, 'fasta') + AlignIO.write(tt.get_reconstructed_alignment(reconstruct_tip_states=reconstruct_tip_states, aln_slice=aln_slice), outaln_name, 'fasta') if seq_info: print("\n--- alignment including ancestral nodes saved as \n\t %s\n"%outaln_name) @@ -217,10 +217,12 @@ def export_sequences_and_tree(tt, basename, is_vcf=False, zero_based=False, if tt.gtr.ambiguous not in [a,d]])+'"' else: if report_ambiguous: 
- n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations if n.mask[pos]>0])+f'",mcc="{n.mcc}"' + n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations if n.mask[pos]>0])\ + +f'",mcc="{",".join([str(x) for x in n.mcc])}"' else: n.comment= '&mutations="' + ','.join([a+str(pos + offset)+d for (a,pos, d) in n.mutations - if tt.gtr.ambiguous not in [a,d] and n.mask[pos]>0])+f'",mcc="{n.mcc}"' + if tt.gtr.ambiguous not in [a,d] and n.mask[pos]>0])\ + +f'",mcc="{",".join([str(x) for x in n.mcc])}"' for (a, pos, d) in n.mutations: if tt.gtr.ambiguous not in [a,d] or report_ambiguous: @@ -484,22 +486,22 @@ def arg_time_trees(params): """ from .arg import parse_arg, setup_arg - arg_params = parse_arg(params.trees[0], params.trees[1], - params.alignments[0], params.alignments[1], params.mccs, + arg_params = parse_arg(params.trees, + params.alignments, params.mccs, fill_overhangs=not params.keep_overhangs) dates = utils.parse_dates(params.dates, date_col=params.date_column, name_col=params.name_column) root = None if params.keep_root else params.reroot - for i,(tree,mask) in enumerate(zip(arg_params['trees'], arg_params['masks'])): + for tree_name in arg_params['trees_dict'].keys(): outdir = get_outdir(params, f'_ARG-treetime') gtr = create_gtr(params) - tt = setup_arg(tree, arg_params['alignment'], arg_params['combined_mask'], mask, dates, arg_params['MCCs'], - gtr=gtr, verbose=params.verbose, fill_overhangs=not params.keep_overhangs, - fixed_clock_rate = params.clock_rate, reroot=root) - - run_timetree(tt, params, outdir, tree_suffix=f"_{i+1}", prune_short=False, method_anc=params.method_anc) + tt = setup_arg(tree_name, arg_params['trees_dict'], arg_params['alignment'], dates, arg_params['MCCs_dict'], arg_params['masks_dict'], + gtr=gtr, verbose=params.verbose, fill_overhangs=not params.keep_overhangs, + fixed_clock_rate = params.clock_rate, reroot=root) + + run_timetree(tt, params, outdir, 
tree_suffix=f"_"+tree_name, prune_short=False, method_anc=params.method_anc, aln_slice=arg_params["seg_pos_dict"][tree_name]) @@ -535,7 +537,7 @@ def timetree(params): return run_timetree(myTree, params, outdir) -def run_timetree(myTree, params, outdir, tree_suffix='', prune_short=True, method_anc='probabilistic'): +def run_timetree(myTree, params, outdir, tree_suffix='', prune_short=True, method_anc='probabilistic', aln_slice=None): ''' this function abstracts the time tree estimation that is used for regular treetime inference and for arg time tree inference. @@ -675,7 +677,7 @@ def run_timetree(myTree, params, outdir, tree_suffix='', prune_short=True, metho export_sequences_and_tree(myTree, basename, is_vcf, params.zero_based, timetree=True, confidence=calc_confidence, reconstruct_tip_states=params.reconstruct_tip_states, - tree_suffix=tree_suffix) + tree_suffix=tree_suffix, aln_slice=aln_slice) return 0