#!/bin/perl # ------------------------------------------------------------------------- # TANGO: Taxonomic Assignment in Metagenomics (tango.pl version 1.2.0) # Copyright (c) 2010 Jose C. Clemente, Jesper Jansson, and Gabriel Valiente # ------------------------------------------------------------------------- # usage: # perl tango.pl [tree.tre] [reads.txt] [q = 0.5] > [output.txt] # # input: # [tree.tree] is a taxonomy tree in Newick format, corresponding to a # classification of biological species in seven taxonomic ranks: # kingdom, phylum, class, order, family, genus, and species # [reads.txt] is a text file containing the parsed output of a mapping # program, in the format: [read_id] [species_id_1] ... [species_id_n] # [q = 0.5] is a parameter that allows balancing the taxonomic assignment # between precision (q = 0) and recall (q = 1), with q = 0.5 (default) # corresponding to the F-measure (harmonic mean of precision and # recall) # # output: # for each line in the input [reads.txt] file: # read_id # for each node_id in [tree.tre] with optimal precision and recall # node_id to which read_id was assigned # taxonomic_rank at which read_id was assigned # ------------------------------------------------------------------------- use Bio::TreeIO; use strict; my $tree_file = $ARGV[0]; # "lineages.tre" my $read_file = $ARGV[1]; # "reads.match" my $q = $ARGV[2]; # [0..1] # $q = 0.5 if scalar @ARGV == 2; $q = 0.5 if @ARGV == 2; die "Usage: perl tango.pl [tree.tre] [reads.txt] [q = 0.5] > [output.txt]\n" unless -e $tree_file and -e $read_file; my $in = Bio::TreeIO->new(-file => $tree_file,-format => 'newick'); my $tree = $in->next_tree; my @rank = qw(kingdom phylum class order family genus species); open IN, "<$read_file"; while () { chomp; my @reads = split /\s+/, $_; my $id = shift @reads; if (scalar @reads >= 2) { my $desc = compute_precision_recall(\@reads,$tree); my ($nodes, $penalty) = assign_reads($desc,\@reads,$tree); print "$id"; for my $node (@{$nodes}) { print " ",$node->id; my $temp = $node; my @lineage = ($temp); while ($temp->ancestor) { $temp = $temp->ancestor; unshift @lineage, $temp; } my $rank = @rank[scalar(@lineage)-1]; print " ($rank)"; } print "\n"; } } close IN; exit; sub compute_precision_recall { my $reads = shift; my @reads = @{$reads}; my $tree = shift; my @leaves = map { $tree->find_node(-id => $_) } @reads; my $lca = $tree->get_lca(@leaves); my @desc = $lca->get_all_Descendents; my @desc_leaves = grep { $_->is_Leaf } @desc; push @desc, $lca; for my $node (@desc) { $node->remove_tag('match'); $node->remove_tag('nonmatch'); $node->remove_tag('tp'); $node->remove_tag('fp'); $node->remove_tag('tn'); $node->remove_tag('fn'); } for my $read (@reads) { my $node = $tree->find_node(-id => $read); $node->add_tag_value('match',1); $node->add_tag_value('nonmatch',0); } for my $node (@desc) { my @matches = grep { $_->is_Leaf && $_->get_tag_values('match') } ($node,$node->get_all_Descendents); my $m = scalar @matches; my $n = 1 + scalar($node->get_all_Descendents) - $m; $node->add_tag_value('match',$m); $node->add_tag_value('nonmatch',$n); } my $mm = $lca->get_tag_values('match'); my $nn = $lca->get_tag_values('nonmatch'); for my $node (@desc) { my $m = $node->get_tag_values('match'); my $n = $node->get_tag_values('nonmatch'); my $tp = $m; my $fp = $n; my $tn = $nn - $n; my $fn = $mm - $m; $node->add_tag_value('tp',$tp); $node->add_tag_value('tn',$tn); $node->add_tag_value('fp',$fp); $node->add_tag_value('fn',$fn); } return \@desc; } sub assign_reads { my $desc = shift; my @desc = @{$desc}; my $reads = shift; my @reads = @{$reads}; my $tree = shift; my $penalty_min = 9**9**9; # infinity my @node_min; for my $node (@desc) { my $tp = $node->get_tag_values('tp'); my $fp = $node->get_tag_values('fp'); my $tn = $node->get_tag_values('tn'); my $fn = $node->get_tag_values('fn'); my $penalty = $q * $fn + (1-$q) * $fp; if ($tp != 0) { $penalty = $q * $fn / $tp + (1-$q) * $fp / $tp; } if ($penalty < $penalty_min) { $penalty_min = $penalty; @node_min = ($node); } elsif ($penalty == $penalty_min) { push @node_min, $node; } } return (\@node_min, $penalty_min); }