#!/usr/bin/perl -w
# Jennifer Meneghin
# 08/14/2007
# This script removes duplicate records from a "short" format BLAST output file, and keeps only the "best" records (sorts by smallest e-value and then biggest percent identity)
# Usage: blast_best.pl
#-----------------------------------------------------------------------------------------------------------------------------------------
#Deal with passed parameters
#-----------------------------------------------------------------------------------------------------------------------------------------
#If no arguments are passed, show usage message and exit program.
if ($#ARGV == -1) {
usage("BLAST BEST 1.0 2007");
exit;
}
#get the names of the input file (first argument passed) and output file (second argument passed)
$in_file = $ARGV[0];
$out_file = $ARGV[1];
#Open the input file for reading, open the output file for writing.
#If either are unsuccessful, print an error message and exit program.
unless ( open(IN, "$in_file") ) {
usage("Got a bad input file: $in_file");
exit;
}
unless ( open(OUT, ">$out_file") ) {
usage("Got a bad output file: $out_file");
exit;
}
#Everything looks good. Print the parameters we've found.
print "Parameters:\ninput file = $in_file\noutput file = $out_file\n\n";
#-----------------------------------------------------------------------------------------------------------------------------------------
#The main event
#-----------------------------------------------------------------------------------------------------------------------------------------
$counter = 0;
$total_counter = 0;
print "De-duplicating File...\n";
@in = ;
#Do stuff for each line of text in the input file.
foreach $line (@in) {
#if the line starts with a pound symbol, it is not real data, so skip this line.
if ( $line =~ /^#/ ) {
next;
}
#Count the total number of data lines in the file.
$total_counter++;
#The chomp commands removes any new line (and carriage return) characters from the end of the line.
chomp($line);
#Split up the tab delimited line, naming only the variables we are interested in.
($id, $junk, $percent_identity, $junk, $junk, $junk, $junk, $junk, $junk, $junk, $evalue, $junk) = split(/\t/, $line);
#check to see if the id label is already in the list of ids (called dedupe)
#if its not there, add it.
if ( $dedupe{$id} ) {
#if it is, look at the old line to see if it is still "better" than the new one.
($junk, $junk, $list_percent_identity,, $junk, $junk, $junk, $junk, $junk, $junk, $junk, $list_evalue, $junk) = split(/\t/,$dedupe{$id});
#if the new evalue is better than the old one, change the value of this id to the new line.
#otherwise, if the the new evalue is the same, and the percent_identity is better, change the value of this id to the new line.
#otherwise, don't do anything (keep the old line).
if ( $evalue < $list_evalue ) {
$dedupe{$id} = $line;
}
elsif ( $evalue == $list_evalue ) {
if ( $percent_identity > $list_percent_identity ) {
$dedupe{$id} = $line;
}
}
}
else {
$dedupe{$id} = $line;
#count the number of non-duplicated lines we have.
$counter++;
}
}
print "Total # records = $total_counter\nBest only # records = $counter\n";
print "Writing to output file...\n";
#Print the final "dedupe" list to the new file (adding the new line back on the end).
foreach $id (sort keys %dedupe) {
print OUT "$dedupe{$id}\n";
}
#Close the files.
close(IN);
close(OUT);
print "Done.\n";
#-----------------------------------------------------------------------------------------------------------------------------------------
#Subroutines
#-----------------------------------------------------------------------------------------------------------------------------------------
sub usage {
my($message) = @_;
print "\n$message\n";
print "\nThis script removes duplicate records from a \"short\" format BLAST output file, and keeps only the \"best\" records.\nIt sorts by smallest e-value and then biggest percent identity.\n";
print "Usage: blast_best.pl \n";
print "\nJennifer Meneghin\n";
print "08/15/2007\n";
}