tutorials:perl:find_homopolymer.3.pl
- find_homopolymer.3.pl
#!/usr/bin/perl -w my $script_name = 'find_homopolymer.3.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # read fasta file, find homopolymeric regions in the seqs # v3 2010/07/13 # bug fix: process the last base # change output format # v2 2009/09/29 # report the position of homopolyers in GFF3 format # v1 2009/09/22 use strict; use warnings; use Getopt::Long; use File::Basename; my $in_file; my $out_file; my $min; my $verbose; my $debug; GetOptions( "in_file=s" => \$in_file, "out_file=s" => \$out_file, "min=i" => \$min, "verbose=i" => \$verbose, "debug=i" => \$debug, ); $min = $min ? $min : '5'; my $out_dir = dirname($out_file); system "mkdir -p $out_dir" unless -e $out_dir; # read in_file my %seq_hash; # key = seq_name, value = seq my %length_hash; # key = seq_name, value = seq_length { open IN, "<$in_file" or die "Can't open input file $in_file: $!\n"; # redefine the record separator local $/ = ">"; my $in_line = <IN>; # toss the first record, which only consists of ">" while ( $in_line = <IN> ) { chomp $in_line; my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); $seq =~ tr/ \t\n\r//d; # Remove whitespace $seq_hash{$seq_name} = uc $seq; # convert seq to all upper case $length_hash{$seq_name} = length $seq; } close IN; } open OUT, ">$out_file" or die "Can't open output file $out_file\n"; my @seq_names = sort keys %seq_hash; my $count_seq = scalar @seq_names; foreach my $seq_name (@seq_names) { my %end_hash; # key = start, value = end my %size_hash; # key = start, value = size my %base_hash; # key = start, value = base # pre-process the 1st base my @chars = split( //, $seq_hash{$seq_name} ); my $pre = shift @chars; my $start = 1; # set start my $size = 1; # set size my $position = 1; # set position foreach my $char (@chars) { # update position $position++; if ( $char eq $pre ) { # the current base is the same as the previous one $size++; } else { # the current base is different from the previous one # terminate the extension if ( $size >= $min ) { # if the size reaches the report threshold $end_hash{$start} = ( $position - 1 ); $size_hash{$start} = $size; $base_hash{$start} = $pre; } else { # do nothing } # reset $start = $position; $size = 1; } # update $pre $pre = $char; } # process the last char if ( $size >= $min ) { # if the size reaches the report threshold $end_hash{$start} = $position; $size_hash{$start} = $size; $base_hash{$start} = $pre; } else { # do nothing } # print location my $count = 0; foreach $start ( sort { $a <=> $b } keys %end_hash ) { $count++; print OUT "$seq_name\t$count\t", "$start\t$end_hash{$start}\t$size_hash{$start}\t", "$base_hash{$start}\n"; } } close OUT; if ($verbose) { } if ($debug) { } exit(0);
tutorials/perl/find_homopolymer.3.pl.txt · Last modified: 2012/06/15 00:32 by chkuo