Kuo Lab Wiki

find_homopolymer.3.pl
#!/usr/bin/perl -w
 
my $script_name = 'find_homopolymer.3.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# read fasta file, find homopolymeric regions in the seqs
# v3 2010/07/13
#   bug fix: process the last base
#   change output format
# v2 2009/09/29
#   report the position of homopolyers in GFF3 format
# v1 2009/09/22
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
 
my $in_file;
my $out_file;
my $min;
my $verbose;
my $debug;
 
GetOptions(
    "in_file=s"      => \$in_file,
    "out_file=s"     => \$out_file,
    "min=i"          => \$min,
    "verbose=i"      => \$verbose,
    "debug=i"        => \$debug,
);
 
$min = $min ? $min : '5';
 
my $out_dir = dirname($out_file);
system "mkdir -p $out_dir" unless -e $out_dir;
 
# read in_file
my %seq_hash;       # key = seq_name, value = seq
my %length_hash;    # key = seq_name, value = seq_length
{
    open IN, "<$in_file" or die "Can't open input file $in_file: $!\n";
 
    # redefine the record separator
    local $/ = ">";
    my $in_line = <IN>;    # toss the first record, which only consists of ">"
    while ( $in_line = <IN> ) {
        chomp $in_line;
        my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
        $seq =~ tr/ \t\n\r//d;    # Remove whitespace
        $seq_hash{$seq_name}    = uc $seq;       # convert seq to all upper case
        $length_hash{$seq_name} = length $seq;
    }
    close IN;
}
 
open OUT, ">$out_file" or die "Can't open output file $out_file\n";
 
my @seq_names = sort keys %seq_hash;
my $count_seq = scalar @seq_names;
 
foreach my $seq_name (@seq_names) {
    my %end_hash;   # key = start, value = end
    my %size_hash;  # key = start, value = size
    my %base_hash;  # key = start, value = base
    # pre-process the 1st base
    my @chars = split( //, $seq_hash{$seq_name} );
    my $pre   = shift @chars;
    my $start = 1;    # set start
    my $size  = 1;    # set size
    my $position = 1; # set position
    foreach my $char (@chars) {
        # update position
        $position++;
 
        if ( $char eq $pre ) {
            # the current base is the same as the previous one
            $size++;
        }
        else {
            # the current base is different from the previous one
            # terminate the extension
            if ( $size >= $min ) {
                # if the size reaches the report threshold
                $end_hash{$start}  = ( $position - 1 );
                $size_hash{$start} = $size;
                $base_hash{$start} = $pre;
            }
            else {
                # do nothing
            }
 
            # reset 
            $start = $position;
            $size  = 1;
        }
 
        # update $pre
        $pre = $char;
    }
 
    # process the last char
    if ( $size >= $min ) {
        # if the size reaches the report threshold
        $end_hash{$start}  = $position;
        $size_hash{$start} = $size;
        $base_hash{$start} = $pre;
    }
    else {
        # do nothing
    }
 
    # print location
    my $count = 0;
    foreach $start ( sort { $a <=> $b } keys %end_hash ) {
        $count++;
        print OUT "$seq_name\t$count\t",
          "$start\t$end_hash{$start}\t$size_hash{$start}\t",
          "$base_hash{$start}\n";
    }
 
}
close OUT;
 
if ($verbose) {
}
 
if ($debug) {
}
 
exit(0);