#!/usr/bin/perl -w # Jennifer Meneghin # 08/14/2007 # This script removes duplicate records from a "short" format BLAST output file, and keeps only the "best" records (sorts by smallest e-value and then biggest percent identity) # Usage: blast_best.pl #----------------------------------------------------------------------------------------------------------------------------------------- #Deal with passed parameters #----------------------------------------------------------------------------------------------------------------------------------------- #If no arguments are passed, show usage message and exit program. if ($#ARGV == -1) { usage("BLAST BEST 1.0 2007"); exit; } #get the names of the input file (first argument passed) and output file (second argument passed) $in_file = $ARGV[0]; $out_file = $ARGV[1]; #Open the input file for reading, open the output file for writing. #If either are unsuccessful, print an error message and exit program. unless ( open(IN, "$in_file") ) { usage("Got a bad input file: $in_file"); exit; } unless ( open(OUT, ">$out_file") ) { usage("Got a bad output file: $out_file"); exit; } #Everything looks good. Print the parameters we've found. print "Parameters:\ninput file = $in_file\noutput file = $out_file\n\n"; #----------------------------------------------------------------------------------------------------------------------------------------- #The main event #----------------------------------------------------------------------------------------------------------------------------------------- $counter = 0; $total_counter = 0; print "De-duplicating File...\n"; @in = ; #Do stuff for each line of text in the input file. foreach $line (@in) { #if the line starts with a pound symbol, it is not real data, so skip this line. if ( $line =~ /^#/ ) { next; } #Count the total number of data lines in the file. $total_counter++; #The chomp commands removes any new line (and carriage return) characters from the end of the line. chomp($line); #Split up the tab delimited line, naming only the variables we are interested in. ($id, $junk, $percent_identity, $junk, $junk, $junk, $junk, $junk, $junk, $junk, $evalue, $junk) = split(/\t/, $line); #check to see if the id label is already in the list of ids (called dedupe) #if its not there, add it. if ( $dedupe{$id} ) { #if it is, look at the old line to see if it is still "better" than the new one. ($junk, $junk, $list_percent_identity,, $junk, $junk, $junk, $junk, $junk, $junk, $junk, $list_evalue, $junk) = split(/\t/,$dedupe{$id}); #if the new evalue is better than the old one, change the value of this id to the new line. #otherwise, if the the new evalue is the same, and the percent_identity is better, change the value of this id to the new line. #otherwise, don't do anything (keep the old line). if ( $evalue < $list_evalue ) { $dedupe{$id} = $line; } elsif ( $evalue == $list_evalue ) { if ( $percent_identity > $list_percent_identity ) { $dedupe{$id} = $line; } } } else { $dedupe{$id} = $line; #count the number of non-duplicated lines we have. $counter++; } } print "Total # records = $total_counter\nBest only # records = $counter\n"; print "Writing to output file...\n"; #Print the final "dedupe" list to the new file (adding the new line back on the end). foreach $id (sort keys %dedupe) { print OUT "$dedupe{$id}\n"; } #Close the files. close(IN); close(OUT); print "Done.\n"; #----------------------------------------------------------------------------------------------------------------------------------------- #Subroutines #----------------------------------------------------------------------------------------------------------------------------------------- sub usage { my($message) = @_; print "\n$message\n"; print "\nThis script removes duplicate records from a \"short\" format BLAST output file, and keeps only the \"best\" records.\nIt sorts by smallest e-value and then biggest percent identity.\n"; print "Usage: blast_best.pl \n"; print "\nJennifer Meneghin\n"; print "08/15/2007\n"; }