Few MPI versions do not support the very useful -m 8 tabular output format. The following code can be used to parse the raw blast output obtained from mpi_blast to tabular output file. The creditials of original creator and modifications are given in comments section of the code.
#=========================================================================================
# Modified BLAST Parser - Script for parsing the BLAST output in tabular format
# Version 1.1.6 (November 26, 2015)
# Modified by Arun Prasanna (arunprasanna83@gmail.com) from orginal script by Kirill Kryukov (http://kirill-kryukov.com/kirr/). He also has many other useful utilities..Check out !!
#
#
# Purpose: Few versions of MPI-BLAST do not support tabular output or often the user faces
# error while opting for tabular output. Hence the original code is modified to produce
# tabular Output verbatim -m 8 option. The output file is compliant with other clustering
# programs like Silix
#
# Changes made are:
# 1. Added one more regex in $qnaming section to account (24,300 letters) instances
# 2. Added $per_id, $rounded variable to calculate percentage identity
# 2. Modified print statement to convert in tabular format
#
#=========================================================================================
# BLAST Parser - Script for parsing the BLAST output
#
# Version 1.1.5 (June 27, 2012)
#
# Copyright (c) 2011-2012 Kirill Kryukov
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
# claim that you wrote the original software. If you use this software
# in a product, an acknowledgment in the product documentation would be
# appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
# misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# This script is a BLAST parser - it will process the BLAST output and convert
# it into a compact form where each hit is represented by a single line.
#
# Usage:
# perl blast_parser.pl <blastout.txt >parsed.txt
#
# You can also use it to parse the BLAST output on the fly as it is generated,
# saving a lot of disk space (although losing all alignments).
#========================================================================================
use strict;
my ($qname,$sname) = ('','');
my ($qchanged,$schanged,$hchanged) = (0,0,0);
my ($qnaming,$snaming,$qlen,$slen) = (0,0,0,0);
my ($qstart,$qend,$sstart,$send) = (-1,-1,-1,-1);
my ($bits,$expect,$length,$ident,$positives,$gaps,$frame,$per_id) = (-1,'',-1,-1,-1,-1,'',''); #added per_id with ''
my ($n_queries,$n_hits) = (0,0);
while (<>)
{
chomp;
if ($qnaming)
{
if ($_ =~ /^\s*Length=(\d+)$/ or $_ =~ /^\s*\((\d+) letters\)$/ or $_ =~ /^\s*\((\d+,\d+) letters\)$/) #added third or statement to check for comma
{
$qlen = $1;
$qnaming = 0;
#print STDERR "Query = $qname\n";
#print STDERR ">";
}
else { $qname .= " $_"; }
next;
}
#print "The value in qlen is: $qlen\n";
#print "The value in qname is: $qname\n";
if ($snaming)
{
if (/^\s*Length\s*=\s*(\d+)$/)
{
$slen = $1;
$snaming = 0;
#print STDERR " Subject = $sname\n";
#print STDERR ":";
}
else { s/^\s+//; $sname .= " $_"; }
next;
}
#print "The value in slen is: $slen\n";
#print "The value in sname is: $sname\n";
if (/^Query[\:\s]\s(\d+)\s.*\s(\d+)$/) #Changed if to while to check if it can still look for fragments
{
if ($qstart < 0 or $1 < $qstart) { $qstart = $1; }
if ($qend < 0 or $1 > $qend) { $qend = $1; }
if ($2 < $qstart) { $qstart = $2; }
if ($2 > $qend) { $qend = $2; }
next;
}
#print "The value in qstart is: $qstart\n";
#print "The value in qend is: $qend\n";
if (/^Sbjct[\:\s]\s(\d+)\s.*\s(\d+)$/) #Changed if to while to check if it can still look for fragments
{
if ($sstart < 0 or $1 < $sstart) { $sstart = $1; }
if ($send < 0 or $1 > $send) { $send = $1; }
if ($2 < $sstart) { $sstart = $2; }
if ($2 > $send) { $send = $2; }
next;
}
#print "The value in sstart is: $sstart\n";
#print "The value in send is: $send\n";
if (/^\sScore/)
{
if ($snaming) { print STDERR "Error: can't complete subject name: $sname\n"; next; }
if ($qnaming) { print STDERR "Error: can't complete query name: $qname\n"; next; }
#&flush_hit(); #not letting it flush hit
($qstart,$qend,$sstart,$send) = (-1,-1,-1,-1);
($bits,$expect,$length,$ident,$positives,$gaps,$frame) = (-1,'',-1,-1,-1,-1,'');
if (/^\sScore\s+=\s+([\d\+e\.]+)\s+bits/) { $bits = int($1); } else { print STDERR "Error: Can't parse score: \"$_\"\n"; }
if (/\s+Expect\(?\d*\)?\s+=\s+([e\-0-9\.]+)/) { $expect = $1; } else { print STDERR "Error: Can't parse expectation: \"$_\"\n"; }
$hchanged = 1;
next;
}
if (/^\sIdentities/)
{
($length,$ident,$positives,$gaps,$frame) = (-1,-1,-1,-1,'');
if (/^\sIdentities\s+=\s+(\d+)\/(\d+)\s+\(\d+\%\)/)
{
($ident,$length) = ($1,$2);
}
if (/\s+Positives\s+=\s+(\d+)\/\d+\s+\(\d+\%\)/)
{
($positives) = ($1);
}
if (/Gaps\s+=\s+(\d+)\//)
{
$gaps = $1;
}
else
{
$gaps = 0;
}
if ($length < 0)
{
print STDERR "Error: unexpected BLAST ' Identities = ' line format: $_\n";
}
next;
}
if (/\s[Ff]rame\s=\s(\S+)$/)
{
$frame = $1;
if ($frame !~ /^(-?\+?\d|-?\+?\d\/-?\+?\d)$/) { print STDERR "Error: Unexpected frame: \"$frame\"\n"; }
next;
}
if (/\sStrand\s=\s(\S+)$/)
{
$frame = $1;
if ($frame !~ /^Plus\/(Plus|Minus)$/) { print STDERR "Error: Unexpected strand: \"$frame\"\n"; }
next;
}
if (/^Query=\s+(.+)$/)
{
my $new_qname = $1;
&flush_hit();
$qname = $new_qname;
$qnaming = 1;
$qchanged = 1;
$n_queries++;
next;
}
if (/^>(.+)$/)
{
my $new_sname = $1;
&flush_hit();
$sname = $new_sname;
$snaming = 1;
$schanged = 1;
next;
}
}
&flush_hit();
print STDERR "OK, processed $n_queries queries, $n_hits hits\n";
sub flush_hit
{
unless ($hchanged) { return; }
if ($qstart<0 || $qend<0 || $sstart<0 || $send<0)
{
print STDERR "Error: no coordinates!\n";
$hchanged = 0;
return;
}
if ($bits<0 || $expect eq '' || $length<0 || $ident<0)
{
print STDERR "Error: parameters missing!\n";
$hchanged = 0;
return;
}
if ($frame eq '') { $frame = '-'; }
#if ($frame eq '')
#{
# print STDERR "Error: Frame/Strand is missing!\n";
# $hchanged = 0;
# return;
#}
if ($positives < 0) { $positives = '-'; }
if ($gaps < 0) { $gaps = '-'; }
if ($qchanged)
{
$qname =~ s/\s+$//;
#print "\n$qname\n"; #print "\n$qname ($qlen)\n";
$qchanged = 0;
$schanged = 1;
}
if ($schanged)
{
$sname =~ s/\s{2,}/ /g;
$sname =~ s/\s+$//;
$sname =~ s/^\s+//;
$sname =~ s/^lcl\|//;
#print "\t$sname\t"; #print " $sname ($slen)\n";
$schanged = 0;
}
$per_id = ($ident/$length)*100;
my $rounded = sprintf "%.2f", $per_id;
print "$qname\t";
print "$sname\t";
print "$rounded\t";
print "$length\t";
print "$positives\t";
print "$gaps\t";
print "$qstart\t";
print "$qend\t";
print "$sstart\t";
print "$send\t";
print "$expect\t";
print "$bits\n";
$hchanged = 0;
$frame = '';
$n_hits++;
}
Comments
Post a Comment