User Tools

Site Tools


tutorials:perl:parse_blast_sim.6.pl
parse_blast_sim.6.pl
#!/usr/bin/perl
 
my $script_name = 'parse_blast_sim.6.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# parse blast results
# input = blast
# output = parsed format for sequence similarity, 1 hit per line
# v6 2010/11/05
#   add format opt, use xml as the default
#   the query_id is now obtained from query_description, not query_name
# v5 2010/07/15
#   style change
# v4 2008/09/10
#   change output format
# v3 2007/02/26
#   bug fix: correct calculation of frac_HSP
# v2 2007/02/09
#   set max(frac_HSP) to 1 (quick & dirty fix)
# v1 2007/02/08
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
use Bio::SearchIO;
 
my $in_file;
my $out_file;
my $format;
my $print_HSP;
my $debug;
 
GetOptions(
    "in_file=s"   => \$in_file,
    "out_file=s"  => \$out_file,
    "format=s"    => \$format,
    "print_HSP=i" => \$print_HSP,
    "debug=i"     => \$debug,
);
$format = $format ? $format : 'blastxml';
 
my $searchio = new Bio::SearchIO(
    -format => "$format",
    -file   => "$in_file"
);
 
# open output file
my $out_dir = dirname($out_file);
system "mkdir -p $out_dir" unless -e $out_dir;
open OUT, ">$out_file" or die "Can't open output file $out_file\n";
 
while ( my $result = $searchio->next_result ) {
    my $query_id     = $result->query_description;
    my $query_length = $result->query_length;
 
    while ( my $hit = $result->next_hit ) {
        my $subject_id     = $hit->name;
        my $subject_length = $hit->length;
        my $significance   = sprintf "%.2e", ( $hit->significance );
 
        my $num_HSP = $hit->num_hsps;
 
        my $num_identical = 0;
        my $num_conserved = 0;
 
        my $hsp_entry;
        my @hsp_array;
 
        my $count_HSP = 0;
        my %query_HSP_hash;    # key = position, value = num_time in HSP
        foreach my $index ( 1 .. $query_length ) {
            $query_HSP_hash{$index} = 0;
        }
        while ( my $hsp = $hit->next_hsp ) {
            $count_HSP++;
            my $query_start = $hsp->start('query');
            my $query_end   = $hsp->end('query');
            foreach my $index ( $query_start .. $query_end ) {
                $query_HSP_hash{$index}++;
            }
 
            my $subject_start = $hsp->start('hit');
            my $subject_end   = $hsp->end('hit');
 
            my $num_identical_HSP = $hsp->num_identical;
            $num_identical += $num_identical_HSP;
            my $num_conserved_HSP = $hsp->num_conserved;
            $num_conserved += $num_conserved_HSP;
 
            my $total_length_HSP = $hsp->length('total');
 
            $hsp_entry = "$count_HSP:" . 
              "$query_start:$query_end:" . 
              "$subject_start:$subject_end:" . 
              "$num_identical_HSP:$num_conserved_HSP:" . 
              "$total_length_HSP;";
            push @hsp_array, $hsp_entry;
 
        } # while hsp
 
        my $num_site_in_HSP =  0; # num of sites of query in HSP 
                                  # (<= query_length)
        my $total_HSP_length = 0; # total length of HSPs 
                                  # (may > query_length)
 
        foreach my $index ( 1 .. $query_length ) {
            $num_site_in_HSP++ if ( $query_HSP_hash{$index} > 0 );
            $total_HSP_length += $query_HSP_hash{$index};
        }
 
        my $frac_HSP = 0;
        if ( $query_length > 0 ) {
            $frac_HSP = sprintf "%.3f", ( $num_site_in_HSP / $query_length );
        }
        my $frac_identical = 0;
        my $frac_conserved = 0;
        if ( $total_HSP_length > 0 ) {
            $frac_identical = sprintf "%.3f",
              ( $num_identical / $total_HSP_length );
            $frac_conserved = sprintf "%.3f",
              ( $num_conserved / $total_HSP_length );
        }
        my $frac_identical_all = 
          sprintf "%.3f", ( $frac_identical * $frac_HSP );
        my $frac_conserved_all = 
          sprintf "%.3f", ( $frac_conserved * $frac_HSP );
 
        print OUT "$query_id\t$query_length\t",
          "$subject_id\t$subject_length\t$significance\t",
          "$frac_HSP\t$frac_identical\t$frac_conserved\t",
          "$frac_identical_all\t$frac_conserved_all\t$num_HSP";
 
        if ($print_HSP) {
            print OUT "\t";
            foreach $hsp_entry (@hsp_array) {
                print OUT $hsp_entry;
            }
        }
 
        print OUT "\n";
 
    } # while subject
} # while query
 
close(OUT);
 
exit(0);
tutorials/perl/parse_blast_sim.6.pl.txt · Last modified: 2012/06/15 00:30 by chkuo