User Tools

Site Tools


tutorials:perl:parse_blast_hsp.4.pl
parse_blast_hsp.4.pl
#!/usr/bin/perl
 
my $script_name = 'parse_blast_hsp.4.pl';
 
# parse blast results
# output = parsed format for sequence similarity, 1 hsp per line
# v4 2010/09/20
#   bug fix: hsp length (should be 'total', not 'query')
#   output format change
# v3 2010/08/05
#   add strand info
# v2 2010/07/15
#   style change
#   change output format
# v1 2007/04/16
 
use strict;
use warnings;
 
use Getopt::Long;
use Bio::SearchIO;
 
my $in_file;
my $out_file;
my $format;
my $print_seq;
my $debug;
 
GetOptions(
    "in_file=s"   => \$in_file,
    "out_file=s"  => \$out_file,
    "format=s"    => \$format,
    "print_seq=i" => \$print_seq,
    "debug=i"     => \$debug,
);
$format = $format ? $format : 'blastxml';
 
my $searchio = new Bio::SearchIO(
    -format => "$format",
    -file   => "$in_file",
);
 
# open output file
open OUT, ">$out_file" or die "Can't open output file $out_file\n";
 
while ( my $result = $searchio->next_result ) {
    my $query_id     = $result->query_description;
    my $query_length = $result->query_length;
 
    while ( my $hit = $result->next_hit ) {
        my $subject_id     = $hit->name;
        my $subject_length = $hit->length;
 
        while ( my $hsp = $hit->next_hsp ) {
            my $evalue = sprintf "%.2e", ( $hsp->evalue );
 
            my $query_start  = $hsp->start('query');
            my $query_end    = $hsp->end('query');
            my $query_strand = $hsp->strand('query');
 
            my $subject_start  = $hsp->start('hit');
            my $subject_end    = $hsp->end('hit');
            my $subject_strand = $hsp->strand('hit');
 
            my $num_identical_HSP  = $hsp->num_identical;
            my $num_conserved_HSP  = $hsp->num_conserved;
            my $num_gap_HSP        = $hsp->gaps;
            my $length_HSP_query   = $hsp->length('query');
            my $length_HSP_subject = $hsp->length('hit');
            my $length_HSP_total   = $hsp->length('total');
 
            my $frac_HSP_query = sprintf "%.3f", 
              ( $length_HSP_query / $query_length );
            my $frac_HSP_subject = sprintf "%.3f", 
              ( $length_HSP_subject / $subject_length );
            my $frac_identical = sprintf "%.3f",
              ( $num_identical_HSP / $length_HSP_total );
            my $frac_conserved = sprintf "%.3f",
              ( $num_conserved_HSP / $length_HSP_total );
            my $frac_gap = sprintf "%.3f",
              ( $num_gap_HSP / $length_HSP_total );
 
            print OUT "$query_id",    # 0
              "\t$query_length",      # 1
              "\t$subject_id",        # 2
              "\t$subject_length",    # 3
              "\t$evalue",            # 4
              "\t$query_start",       # 5
              "\t$query_end",         # 6
              "\t$query_strand",      # 7
              "\t$frac_HSP_query",    # 8
              "\t$subject_start",     # 9
              "\t$subject_end",       # 10
              "\t$subject_strand",    # 11
              "\t$frac_HSP_subject",  # 12
              "\t$length_HSP_total",  # 13
              "\t$num_identical_HSP", # 14
              "\t$frac_identical",    # 15
              "\t$num_conserved_HSP", # 16
              "\t$frac_conserved",    # 17
              "\t$num_gap_HSP",       # 18
              "\t$frac_gap";          # 19
            if ($print_seq) {
                print OUT "\t", $hsp->query_string; # 20
                print OUT "\t", $hsp->hit_string;   # 21
            }
            print OUT "\n";
 
        }    # while hsp
 
    }    # while subject
}    # while query
 
close(OUT);
 
exit(0);
tutorials/perl/parse_blast_hsp.4.pl.txt · Last modified: 2012/06/15 00:31 by chkuo