User Tools

Site Tools


tutorials:perl_examples

This is an old revision of the document!


Perl Examples

Unwrap Sequences in Fasta Files

#!/usr/bin/perl -w
 
my $script_name = 'unwrap_seq_fasta.1.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# remove extra line-breaks (in the sequences) in fasta files
# v1 2010/03/04
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
 
my $in_dir;
my $in_file_ext;
my $out_dir;
my $out_file_ext;
my $verbose;
my $debug;
 
GetOptions(
    "in_dir=s"            => \$in_dir,
    "in_file_ext=s"       => \$in_file_ext,
    "out_dir=s"           => \$out_dir,
    "out_file_ext=s"      => \$out_file_ext,
    "verbose=i"           => \$verbose,
    "debug=i"             => \$debug,
);
 
system "mkdir -p $out_dir" unless -e $out_dir;
$in_file_ext  = $in_file_ext  ? $in_file_ext  : 'fasta';
$out_file_ext = $out_file_ext ? $out_file_ext : 'fasta';
 
my $count_file = 0;
opendir( DIR, $in_dir ) or die "can't open $in_dir: $!\n";
while ( defined( my $in_file = readdir(DIR) ) ) {
    if ( $in_file =~ /^(\S+)\.$in_file_ext$/ ) {
        my $file_id = $1;
        my $count_seq = 0;
        $count_file++;
 
        $in_file = $in_dir . $in_file;
        my $out_file = $out_dir . $file_id . '.' . $out_file_ext;
        open OUT, ">$out_file" or die "Can't open output file $out_file\n";
        {
            # redefine the record separator
            local $/ = ">";
            open IN, "<$in_file" or die "Can't open input file $in_file: $!\n";
            my $in_line = <IN>; # toss the first record, which only consists of ">"
            while ( $in_line = <IN> ) {
                chomp $in_line;
                my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
                $seq =~ tr/ \t\n\r//d;    # Remove whitespace
                $count_seq++;
 
                print OUT "\>$seq_name\n$seq\n";
            }
            close IN;
        }
        close OUT;
 
        if ($verbose) {
            print "file_id = $file_id, count_seq = $count_seq\n";
        }
 
    }
}
closedir(DIR);
 
 
exit(0);

Find Homopolymeric Regions

#!/usr/bin/perl -w
 
my $script_name = 'find_homopolymer.3.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# read fasta file, find homopolymeric regions in the seqs
# v3 2010/07/13
#   bug fix: process the last base
#   change output format
# v2 2009/09/29
#   report the position of homopolyers in GFF3 format
# v1 2009/09/22
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
 
my $in_file;
my $out_file;
my $min;
my $verbose;
my $debug;
 
GetOptions(
    "in_file=s"      => \$in_file,
    "out_file=s"     => \$out_file,
    "min=i"          => \$min,
    "verbose=i"      => \$verbose,
    "debug=i"        => \$debug,
);
 
$min = $min ? $min : '5';
 
my $out_dir = dirname($out_file);
system "mkdir -p $out_dir" unless -e $out_dir;
 
# read in_file
my %seq_hash;       # key = seq_name, value = seq
my %length_hash;    # key = seq_name, value = seq_length
{
    open IN, "<$in_file" or die "Can't open input file $in_file: $!\n";
 
    # redefine the record separator
    local $/ = ">";
    my $in_line = <IN>;    # toss the first record, which only consists of ">"
    while ( $in_line = <IN> ) {
        chomp $in_line;
        my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
        $seq =~ tr/ \t\n\r//d;    # Remove whitespace
        $seq_hash{$seq_name}    = uc $seq;       # convert seq to all upper case
        $length_hash{$seq_name} = length $seq;
    }
    close IN;
}
 
open OUT, ">$out_file" or die "Can't open output file $out_file\n";
 
my @seq_names = sort keys %seq_hash;
my $count_seq = scalar @seq_names;
 
foreach my $seq_name (@seq_names) {
    my %end_hash;   # key = start, value = end
    my %size_hash;  # key = start, value = size
    my %base_hash;  # key = start, value = base
    # pre-process the 1st base
    my @chars = split( //, $seq_hash{$seq_name} );
    my $pre   = shift @chars;
    my $start = 1;    # set start
    my $size  = 1;    # set size
    my $position = 1; # set position
    foreach my $char (@chars) {
        # update position
        $position++;
 
        if ( $char eq $pre ) {
            # the current base is the same as the previous one
            $size++;
        }
        else {
            # the current base is different from the previous one
            # terminate the extension
            if ( $size >= $min ) {
                # if the size reaches the report threshold
                $end_hash{$start}  = ( $position - 1 );
                $size_hash{$start} = $size;
                $base_hash{$start} = $pre;
            }
            else {
                # do nothing
            }
 
            # reset 
            $start = $position;
            $size  = 1;
        }
 
        # update $pre
        $pre = $char;
    }
 
    # process the last char
    if ( $size >= $min ) {
        # if the size reaches the report threshold
        $end_hash{$start}  = $position;
        $size_hash{$start} = $size;
        $base_hash{$start} = $pre;
    }
    else {
        # do nothing
    }
 
    # print location
    my $count = 0;
    foreach $start ( sort { $a <=> $b } keys %end_hash ) {
        $count++;
        print OUT "$seq_name\t$count\t",
          "$start\t$end_hash{$start}\t$size_hash{$start}\t",
          "$base_hash{$start}\n";
    }
 
}
close OUT;
 
if ($verbose) {
}
 
if ($debug) {
}
 
exit(0);

Generate Command Scripts for Running Blast+

#!/usr/bin/perl -w
 
my $script_name = 'cmd_blast+.1.pl';
 
# Chih-Horng Kuo 
# generate commands for running NCBI blast+
# v1 2010/07/13
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
 
my $exe;
my $in_dir;
my $out_dir;
my $sh_dir;
my $in_file_ext;
my $out_file_ext;
my $sh_prefix;
my $opt;
my $n_job;
my $debug;
 
GetOptions(
    "exe=s"          => \$exe,
    "in_dir=s"       => \$in_dir,
    "out_dir=s"      => \$out_dir,
    "sh_dir=s"       => \$sh_dir,
    "in_file_ext=s"  => \$in_file_ext,
    "out_file_ext=s" => \$out_file_ext,
    "sh_prefix=s"    => \$sh_prefix,
    "opt=s"          => \$opt,
    "n_job=i"        => \$n_job,
    "debug=i"        => \$debug,
);
 
$exe          = $exe          ? $exe          : '/usr/local/blast+/bin/blastn';
$in_file_ext  = $in_file_ext  ? $in_file_ext  : 'fasta';
$out_file_ext = $out_file_ext ? $out_file_ext : 'blast';
$sh_prefix    = $sh_prefix    ? $sh_prefix    : 'job';
$n_job        = $n_job        ? $n_job        : '1';
 
system "mkdir -p $out_dir" unless -e $out_dir;
system "mkdir -p $sh_dir"  unless -e $sh_dir;
 
my $count = 0;
my %job_id_HoA;    # key = job_id, value = array of file_id
opendir( DIR, $in_dir ) or die "can't open $in_dir: $!\n";
while ( defined( my $in_file = readdir(DIR) ) ) {
    if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) {
        my $job_id = ( $count % $n_job ) + 1;
        push @{ $job_id_HoA{$job_id} }, $1;
        $count++;
    }
}
closedir(DIR);
 
# generate job .sh
foreach my $job_id ( sort keys %job_id_HoA ) {
    my $sh_file = $sh_dir . $sh_prefix . $job_id . '.sh';
    open OUT, ">$sh_file" or die "Can't open output file $sh_file: $!\n";
 
    # shell
    print OUT '#!/bin/bash', "\n";
 
    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
        my $in_file = $in_dir . $file_id . '.' . $in_file_ext;
        my $out_file = $out_dir . $file_id . '.' . $out_file_ext;
 
        print OUT "$exe -query $in_file -out $out_file";
        if ($opt) {
            print OUT " $opt";
        }
        print OUT "\n";
    }
 
    close OUT;
    system "chmod +x $sh_file";
}
 
if ($debug) {
}
 
exit(0);

Execute Command Scripts

#!/usr/bin/perl -w
 
my $script_name = 'execute.3.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# execute all .sh in the in_dir
# v3 2010/02/04
#   style change
# v2 2009/06/18
# v1 2006/05/03
 
use strict;
use warnings;
 
use Getopt::Long;
 
my $in_dir;
my $exe_dir;
my $in_file_ext;
my $batch_file_ext;
my $log_file_ext;
my $prefix;    # prefix of batch files
my $n_job;     # split into n batch files
my $debug;
 
GetOptions(
    "in_dir=s"         => \$in_dir,
    "exe_dir=s"        => \$exe_dir,
    "in_file_ext=s"    => \$in_file_ext,
    "batch_file_ext=s" => \$batch_file_ext,
    "log_file_ext=s"   => \$log_file_ext,
    "prefix=s"         => \$prefix,
    "n_job=i"          => \$n_job,
    "debug=i"          => \$debug,
);
$prefix         = $prefix         ? $prefix         : 'job';
$in_file_ext    = $in_file_ext    ? $in_file_ext    : 'sh';
$batch_file_ext = $batch_file_ext ? $batch_file_ext : 'sh';
$log_file_ext   = $log_file_ext   ? $log_file_ext   : 'log';
 
system "mkdir -p $exe_dir" unless -e $exe_dir;
 
my %job_id_HoA;    # key = job_id, value = array of file_id
my $count = 0;
opendir( DIR, $in_dir ) or die "can't open $in_dir: $!";
while ( defined( my $in_file = readdir(DIR) ) ) {
    if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) {
        my $job_id = ( $count % $n_job ) + 1;
        push @{ $job_id_HoA{$job_id} }, $1;
        $count++;
    }
}
closedir(DIR);
 
foreach my $job_id ( sort keys %job_id_HoA ) {
    my $batch_file = $exe_dir . $prefix . $job_id . '.' . $batch_file_ext;
    my $log_file   = $exe_dir . $prefix . $job_id . '.' . $log_file_ext;
    open OUT, ">$batch_file" or die "Can't open output file $batch_file: $!\n";
 
    # shell
    print OUT '#!/bin/bash', "\n";
    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
        print OUT "$in_dir$file_id\.$in_file_ext\n";
    }
 
    close OUT;
    system "chmod +x $batch_file";
    system "$batch_file > $log_file 2>&1 &";
    print "command: $batch_file > $log_file 2>&1 &\n";
}
 
exit(0);
tutorials/perl_examples.1280773104.txt.gz · Last modified: 2010/08/03 02:18 by chkuo