tutorials:perl:parse_blast_sim.6.pl
- parse_blast_sim.6.pl
#!/usr/bin/perl my $script_name = 'parse_blast_sim.6.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # parse blast results # input = blast # output = parsed format for sequence similarity, 1 hit per line # v6 2010/11/05 # add format opt, use xml as the default # the query_id is now obtained from query_description, not query_name # v5 2010/07/15 # style change # v4 2008/09/10 # change output format # v3 2007/02/26 # bug fix: correct calculation of frac_HSP # v2 2007/02/09 # set max(frac_HSP) to 1 (quick & dirty fix) # v1 2007/02/08 use strict; use warnings; use Getopt::Long; use File::Basename; use Bio::SearchIO; my $in_file; my $out_file; my $format; my $print_HSP; my $debug; GetOptions( "in_file=s" => \$in_file, "out_file=s" => \$out_file, "format=s" => \$format, "print_HSP=i" => \$print_HSP, "debug=i" => \$debug, ); $format = $format ? $format : 'blastxml'; my $searchio = new Bio::SearchIO( -format => "$format", -file => "$in_file" ); # open output file my $out_dir = dirname($out_file); system "mkdir -p $out_dir" unless -e $out_dir; open OUT, ">$out_file" or die "Can't open output file $out_file\n"; while ( my $result = $searchio->next_result ) { my $query_id = $result->query_description; my $query_length = $result->query_length; while ( my $hit = $result->next_hit ) { my $subject_id = $hit->name; my $subject_length = $hit->length; my $significance = sprintf "%.2e", ( $hit->significance ); my $num_HSP = $hit->num_hsps; my $num_identical = 0; my $num_conserved = 0; my $hsp_entry; my @hsp_array; my $count_HSP = 0; my %query_HSP_hash; # key = position, value = num_time in HSP foreach my $index ( 1 .. $query_length ) { $query_HSP_hash{$index} = 0; } while ( my $hsp = $hit->next_hsp ) { $count_HSP++; my $query_start = $hsp->start('query'); my $query_end = $hsp->end('query'); foreach my $index ( $query_start .. $query_end ) { $query_HSP_hash{$index}++; } my $subject_start = $hsp->start('hit'); my $subject_end = $hsp->end('hit'); my $num_identical_HSP = $hsp->num_identical; $num_identical += $num_identical_HSP; my $num_conserved_HSP = $hsp->num_conserved; $num_conserved += $num_conserved_HSP; my $total_length_HSP = $hsp->length('total'); $hsp_entry = "$count_HSP:" . "$query_start:$query_end:" . "$subject_start:$subject_end:" . "$num_identical_HSP:$num_conserved_HSP:" . "$total_length_HSP;"; push @hsp_array, $hsp_entry; } # while hsp my $num_site_in_HSP = 0; # num of sites of query in HSP # (<= query_length) my $total_HSP_length = 0; # total length of HSPs # (may > query_length) foreach my $index ( 1 .. $query_length ) { $num_site_in_HSP++ if ( $query_HSP_hash{$index} > 0 ); $total_HSP_length += $query_HSP_hash{$index}; } my $frac_HSP = 0; if ( $query_length > 0 ) { $frac_HSP = sprintf "%.3f", ( $num_site_in_HSP / $query_length ); } my $frac_identical = 0; my $frac_conserved = 0; if ( $total_HSP_length > 0 ) { $frac_identical = sprintf "%.3f", ( $num_identical / $total_HSP_length ); $frac_conserved = sprintf "%.3f", ( $num_conserved / $total_HSP_length ); } my $frac_identical_all = sprintf "%.3f", ( $frac_identical * $frac_HSP ); my $frac_conserved_all = sprintf "%.3f", ( $frac_conserved * $frac_HSP ); print OUT "$query_id\t$query_length\t", "$subject_id\t$subject_length\t$significance\t", "$frac_HSP\t$frac_identical\t$frac_conserved\t", "$frac_identical_all\t$frac_conserved_all\t$num_HSP"; if ($print_HSP) { print OUT "\t"; foreach $hsp_entry (@hsp_array) { print OUT $hsp_entry; } } print OUT "\n"; } # while subject } # while query close(OUT); exit(0);
tutorials/perl/parse_blast_sim.6.pl.txt · Last modified: 2012/06/15 00:30 by chkuo