BLAST_TABULAR_OUTPUT Generator


Few MPI versions do not support the very useful -m 8 tabular output format. The following code can be used to parse the raw blast output obtained from mpi_blast to tabular output file. The creditials of original creator and modifications are given in comments section of the code.
#=========================================================================================
# Modified BLAST Parser  -  Script for parsing the BLAST output in tabular format
# Version 1.1.6 (November 26, 2015)
# Modified by Arun Prasanna (arunprasanna83@gmail.com) from orginal script by Kirill Kryukov (http://kirill-kryukov.com/kirr/). He also has many other useful utilities..Check out !!
#  
# 
# Purpose: Few versions of MPI-BLAST do not support tabular output or often the user faces 
# error while opting for tabular output. Hence the original code is modified to produce 
# tabular Output verbatim -m 8 option. The output file is compliant with other clustering 
# programs like Silix
# 
# Changes made are:
#  1. Added one more regex in $qnaming section to account (24,300 letters) instances 
#  2. Added $per_id, $rounded variable to calculate percentage identity
#  2. Modified print statement to convert in tabular format
#
#=========================================================================================
# BLAST Parser  -  Script for parsing the BLAST output
#
# Version 1.1.5 (June 27, 2012)
#
# Copyright (c) 2011-2012 Kirill Kryukov
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
# 1. The origin of this software must not be misrepresented; you must not
#    claim that you wrote the original software. If you use this software
#    in a product, an acknowledgment in the product documentation would be
#     appreciated but is not required.
# 2. Altered source versions must be plainly marked as such, and must not be
#    misrepresented as being the original software.
# 3. This notice may not be removed or altered from any source distribution.
#
# This script is a BLAST parser - it will process the BLAST output and convert
# it into a compact form where each hit is represented by a single line.
#
# Usage:
#   perl blast_parser.pl <blastout.txt >parsed.txt
#
# You can also use it to parse the BLAST output on the fly as it is generated,
# saving a lot of disk space (although losing all alignments).
#========================================================================================
use strict;
my ($qname,$sname) = ('','');
my ($qchanged,$schanged,$hchanged) = (0,0,0);
my ($qnaming,$snaming,$qlen,$slen) = (0,0,0,0);
my ($qstart,$qend,$sstart,$send) = (-1,-1,-1,-1);
my ($bits,$expect,$length,$ident,$positives,$gaps,$frame,$per_id) = (-1,'',-1,-1,-1,-1,'',''); #added per_id with '' 
my ($n_queries,$n_hits) = (0,0);

while (<>)
{
    chomp;

    if ($qnaming)
    {
        if ($_ =~ /^\s*Length=(\d+)$/ or $_ =~ /^\s*\((\d+) letters\)$/ or $_ =~ /^\s*\((\d+,\d+) letters\)$/) #added third or statement to check for comma
        {
            $qlen = $1;
            $qnaming = 0;
            #print STDERR "Query = $qname\n";
            #print STDERR ">";
        }
        else { $qname .= " $_"; }
        next;
    }
    #print "The value in qlen is: $qlen\n";
    #print "The value in qname is: $qname\n";

    if ($snaming)
    {
        if (/^\s*Length\s*=\s*(\d+)$/)
        {
            $slen = $1;
            $snaming = 0;
            #print STDERR "    Subject = $sname\n";
            #print STDERR ":";
        }
        else { s/^\s+//; $sname .= " $_"; }
        next;
    }
    #print "The value in slen is: $slen\n";
    #print "The value in sname is: $sname\n";

    
    if (/^Query[\:\s]\s(\d+)\s.*\s(\d+)$/)   #Changed if to while to check if it can still look for fragments
    {
        if ($qstart < 0 or $1 < $qstart) { $qstart = $1; }
        if ($qend < 0 or $1 > $qend) { $qend = $1; }
        if ($2 < $qstart) { $qstart = $2; }
        if ($2 > $qend) { $qend = $2; }
        next;
    }
    #print "The value in qstart is: $qstart\n";
    #print "The value in qend is: $qend\n";
    
    if (/^Sbjct[\:\s]\s(\d+)\s.*\s(\d+)$/)    #Changed if to while to check if it can still look for fragments
    {
        if ($sstart < 0 or $1 < $sstart) { $sstart = $1; }
        if ($send < 0 or $1 > $send) { $send = $1; }
        if ($2 < $sstart) { $sstart = $2; }
        if ($2 > $send) { $send = $2; }
        next;
    }
    #print "The value in sstart is: $sstart\n";
    #print "The value in send is: $send\n";
    
    if (/^\sScore/)
    {
        if ($snaming) { print STDERR "Error: can't complete subject name: $sname\n"; next; }
        if ($qnaming) { print STDERR "Error: can't complete query name: $qname\n"; next; }
        #&flush_hit();   #not letting it flush hit
        ($qstart,$qend,$sstart,$send) = (-1,-1,-1,-1);
        ($bits,$expect,$length,$ident,$positives,$gaps,$frame) = (-1,'',-1,-1,-1,-1,'');

        if (/^\sScore\s+=\s+([\d\+e\.]+)\s+bits/) { $bits = int($1); } else { print STDERR "Error: Can't parse score: \"$_\"\n"; }
        if (/\s+Expect\(?\d*\)?\s+=\s+([e\-0-9\.]+)/) { $expect = $1; } else { print STDERR "Error: Can't parse expectation: \"$_\"\n"; }
        $hchanged = 1;
        next;
    }

    if (/^\sIdentities/)
    {
        ($length,$ident,$positives,$gaps,$frame) = (-1,-1,-1,-1,'');
        if (/^\sIdentities\s+=\s+(\d+)\/(\d+)\s+\(\d+\%\)/)
        {
            ($ident,$length) = ($1,$2);
        }
        if (/\s+Positives\s+=\s+(\d+)\/\d+\s+\(\d+\%\)/)
        {
            ($positives) = ($1);
        }
        if (/Gaps\s+=\s+(\d+)\//)
        {
            $gaps = $1;
        }
        else
        {
            $gaps = 0;
        }
        if ($length < 0)
        {
            print STDERR "Error: unexpected BLAST ' Identities = ' line format: $_\n";
        }
        next;
    }

    if (/\s[Ff]rame\s=\s(\S+)$/)
    {
        $frame = $1;
        if ($frame !~ /^(-?\+?\d|-?\+?\d\/-?\+?\d)$/) { print STDERR "Error: Unexpected frame: \"$frame\"\n"; }
        next;
    }

    if (/\sStrand\s=\s(\S+)$/)
    {
        $frame = $1;
        if ($frame !~ /^Plus\/(Plus|Minus)$/) { print STDERR "Error: Unexpected strand: \"$frame\"\n"; }
        next;
    }

    if (/^Query=\s+(.+)$/)
    {
        my $new_qname = $1;
        &flush_hit();
        $qname = $new_qname;
        $qnaming = 1;
        $qchanged = 1;
        $n_queries++;
        next;
    }

    if (/^>(.+)$/)
    {
        my $new_sname = $1;
        &flush_hit();
        $sname = $new_sname;
        $snaming = 1;
        $schanged = 1;
        next;
    }
}

&flush_hit();
print STDERR "OK, processed $n_queries queries, $n_hits hits\n";


sub flush_hit
{
    unless ($hchanged) { return; }
    if ($qstart<0 || $qend<0 || $sstart<0 || $send<0)
    {
        print STDERR "Error: no coordinates!\n";
        $hchanged = 0;
        return;
    }
    if ($bits<0 || $expect eq '' || $length<0 || $ident<0)
    {
        print STDERR "Error: parameters missing!\n";
        $hchanged = 0;
        return;
    }
    if ($frame eq '') { $frame = '-'; }

    #if ($frame eq '')
    #{
    #    print STDERR "Error: Frame/Strand is missing!\n";
    #    $hchanged = 0;
    #    return;
    #}

    if ($positives < 0) { $positives = '-'; }
    if ($gaps < 0) { $gaps = '-'; }

    if ($qchanged)
    {
        $qname =~ s/\s+$//;
        #print "\n$qname\n";  #print "\n$qname ($qlen)\n";
        $qchanged = 0;
        $schanged = 1;
    }

    if ($schanged)
    {
        $sname =~ s/\s{2,}/ /g;
        $sname =~ s/\s+$//;
        $sname =~ s/^\s+//;
        $sname =~ s/^lcl\|//;
        #print "\t$sname\t"; #print "     $sname ($slen)\n";
        $schanged = 0;
    }
    
    $per_id = ($ident/$length)*100;
    my $rounded = sprintf "%.2f", $per_id;

    print "$qname\t";
    print "$sname\t";
    print "$rounded\t";
    print "$length\t";
    print "$positives\t";
    print "$gaps\t";
    print "$qstart\t";
    print "$qend\t";
    print "$sstart\t";
    print "$send\t";
    print "$expect\t"; 
    print "$bits\n";
    $hchanged = 0;
    $frame = '';

    $n_hits++;
}

Comments

Popular posts from this blog

Pick Matching lines with list of keywords

Install Parallel versions of Python from source

Map multiple annotations using pandas