Differences

This shows you the differences between two versions of the page.

--- tutorials:perl_examples [2010/08/03 02:20] – chkuo
+++ tutorials:perl_examples [2017/04/12 17:27] (current) – chkuo
@@ Line 1: / Line 1: @@
-====== Perl Examples ======
+====== Perl examples ======
-  * Find empty files in a directory: [[tutorial:perl:find_empty_file.2.pl]]
+  * Find empty files in a directory: [[tutorials:perl:find_empty_file.2.pl]]
-  * Unwrap sequences in fasta files: [[tutorial:perl:unwrap_seq_fasta.1.pl]]
+  * Rename files based on regex: [[tutorials:perl:rename_file_by_regex.3.pl]]
+  * Generate command scripts to run blast+: [[tutorials:perl:cmd_blast+.1.pl]]
+  * Execute command scripts: [[tutorials:perl:execute.3.pl]]
+  * Parse blast+ results, 1 hit per line: [[tutorials:perl:parse_blast_sim.6.pl]]
+  * Parse blast+ results, 1 HSP per line: [[tutorials:perl:parse_blast_hsp.4.pl]]
-===== Find Homopolymeric Regions =====
+  * Unwrap sequences in fasta files: [[tutorials:perl:unwrap_seq_fasta.1.pl]]
-<code perl>
+  * Find homopolymeric regions: [[tutorials:perl:find_homopolymer.3.pl]]
-#!/usr/bin/perl -w
+  * Correct sequence orientation in a fasta file: [[tutorials:perl:correct_fasta_orientation.1.pl]]
+  * Trim sequence based on lucy: [[tutorials:perl:trim_lucy.2.pl]]
+  * Trim sequence based on regex: [[tutorials:perl:trim_seq_by_regex.1.pl]]
-my $script_name = 'find_homopolymer.3.pl';
-# Chih-Horng Kuo <chkuo@lifedev.org>
-# read fasta file, find homopolymeric regions in the seqs
-# v3 2010/07/13
-#   bug fix: process the last base
-#   change output format
-# v2 2009/09/29
-#   report the position of homopolyers in GFF3 format
-# v1 2009/09/22
-use strict;
-use warnings;
-use Getopt::Long;
-use File::Basename;
-my $in_file;
-my $out_file;
-my $min;
-my $verbose;
-my $debug;
-GetOptions(
-    "in_file=s"      => \$in_file,
-    "out_file=s"     => \$out_file,
-    "min=i"          => \$min,
-    "verbose=i"      => \$verbose,
-    "debug=i"        => \$debug,
-);
-$min = $min ? $min : '5';
-my $out_dir = dirname($out_file);
-system "mkdir -p $out_dir" unless -e $out_dir;
-# read in_file
-my %seq_hash;       # key = seq_name, value = seq
-my %length_hash;    # key = seq_name, value = seq_length
-{
-    open IN, "<$in_file" or die "Can't open input file $in_file: $!\n";
-    # redefine the record separator
-    local $/ = ">";
-    my $in_line = <IN>;    # toss the first record, which only consists of ">"
-    while ( $in_line = <IN> ) {
-        chomp $in_line;
-        my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
-        $seq =~ tr/ \t\n\r//d;    # Remove whitespace
-        $seq_hash{$seq_name}    = uc $seq;       # convert seq to all upper case
-        $length_hash{$seq_name} = length $seq;
-    }
-    close IN;
-}
-open OUT, ">$out_file" or die "Can't open output file $out_file\n";
-my @seq_names = sort keys %seq_hash;
-my $count_seq = scalar @seq_names;
-foreach my $seq_name (@seq_names) {
-    my %end_hash;   # key = start, value = end
-    my %size_hash;  # key = start, value = size
-    my %base_hash;  # key = start, value = base
-    # pre-process the 1st base
-    my @chars = split( //, $seq_hash{$seq_name} );
-    my $pre   = shift @chars;
-    my $start = 1;    # set start
-    my $size  = 1;    # set size
-    my $position = 1; # set position
-    foreach my $char (@chars) {
-        # update position
-        $position++;
-        if ( $char eq $pre ) {
-            # the current base is the same as the previous one
-            $size++;
-        }
-        else {
-            # the current base is different from the previous one
-            # terminate the extension
-            if ( $size >= $min ) {
-                # if the size reaches the report threshold
-                $end_hash{$start}  = ( $position - 1 );
-                $size_hash{$start} = $size;
-                $base_hash{$start} = $pre;
-            }
-            else {
-                # do nothing
-            }
-            # reset
-            $start = $position;
-            $size  = 1;
-        }
-        # update $pre
-        $pre = $char;
-    }
-    # process the last char
-    if ( $size >= $min ) {
-        # if the size reaches the report threshold
-        $end_hash{$start}  = $position;
-        $size_hash{$start} = $size;
-        $base_hash{$start} = $pre;
-    }
-    else {
-        # do nothing
-    }
-    # print location
-    my $count = 0;
-    foreach $start ( sort { $a <=> $b } keys %end_hash ) {
-        $count++;
-        print OUT "$seq_name\t$count\t",
-          "$start\t$end_hash{$start}\t$size_hash{$start}\t",
-          "$base_hash{$start}\n";
-    }
-}
-close OUT;
-if ($verbose) {
-}
-if ($debug) {
-}
-exit(0);
-</code>
-===== Generate Command Scripts for Running Blast+ =====
-<code perl>
-#!/usr/bin/perl -w
-my $script_name = 'cmd_blast+.1.pl';
-# Chih-Horng Kuo
-# generate commands for running NCBI blast+
-# v1 2010/07/13
-use strict;
-use warnings;
-use Getopt::Long;
-use File::Basename;
-my $exe;
-my $in_dir;
-my $out_dir;
-my $sh_dir;
-my $in_file_ext;
-my $out_file_ext;
-my $sh_prefix;
-my $opt;
-my $n_job;
-my $debug;
-GetOptions(
-    "exe=s"          => \$exe,
-    "in_dir=s"       => \$in_dir,
-    "out_dir=s"      => \$out_dir,
-    "sh_dir=s"       => \$sh_dir,
-    "in_file_ext=s"  => \$in_file_ext,
-    "out_file_ext=s" => \$out_file_ext,
-    "sh_prefix=s"    => \$sh_prefix,
-    "opt=s"          => \$opt,
-    "n_job=i"        => \$n_job,
-    "debug=i"        => \$debug,
-);
-$exe          = $exe          ? $exe          : '/usr/local/blast+/bin/blastn';
-$in_file_ext  = $in_file_ext  ? $in_file_ext  : 'fasta';
-$out_file_ext = $out_file_ext ? $out_file_ext : 'blast';
-$sh_prefix    = $sh_prefix    ? $sh_prefix    : 'job';
-$n_job        = $n_job        ? $n_job        : '1';
-system "mkdir -p $out_dir" unless -e $out_dir;
-system "mkdir -p $sh_dir"  unless -e $sh_dir;
-my $count = 0;
-my %job_id_HoA;    # key = job_id, value = array of file_id
-opendir( DIR, $in_dir ) or die "can't open $in_dir: $!\n";
-while ( defined( my $in_file = readdir(DIR) ) ) {
-    if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) {
-        my $job_id = ( $count % $n_job ) + 1;
-        push @{ $job_id_HoA{$job_id} }, $1;
-        $count++;
-    }
-}
-closedir(DIR);
-# generate job .sh
-foreach my $job_id ( sort keys %job_id_HoA ) {
-    my $sh_file = $sh_dir . $sh_prefix . $job_id . '.sh';
-    open OUT, ">$sh_file" or die "Can't open output file $sh_file: $!\n";
-    # shell
-    print OUT '#!/bin/bash', "\n";
-    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
-        my $in_file = $in_dir . $file_id . '.' . $in_file_ext;
-        my $out_file = $out_dir . $file_id . '.' . $out_file_ext;
-        print OUT "$exe -query $in_file -out $out_file";
-        if ($opt) {
-            print OUT " $opt";
-        }
-        print OUT "\n";
-    }
-    close OUT;
-    system "chmod +x $sh_file";
-}
-if ($debug) {
-}
-exit(0);
-</code>
-===== Execute Command Scripts =====
-<code perl>
-#!/usr/bin/perl -w
-my $script_name = 'execute.3.pl';
-# Chih-Horng Kuo <chkuo@lifedev.org>
-# execute all .sh in the in_dir
-# v3 2010/02/04
-#   style change
-# v2 2009/06/18
-# v1 2006/05/03
-use strict;
-use warnings;
-use Getopt::Long;
-my $in_dir;
-my $exe_dir;
-my $in_file_ext;
-my $batch_file_ext;
-my $log_file_ext;
-my $prefix;    # prefix of batch files
-my $n_job;     # split into n batch files
-my $debug;
-GetOptions(
-    "in_dir=s"         => \$in_dir,
-    "exe_dir=s"        => \$exe_dir,
-    "in_file_ext=s"    => \$in_file_ext,
-    "batch_file_ext=s" => \$batch_file_ext,
-    "log_file_ext=s"   => \$log_file_ext,
-    "prefix=s"         => \$prefix,
-    "n_job=i"          => \$n_job,
-    "debug=i"          => \$debug,
-);
-$prefix         = $prefix         ? $prefix         : 'job';
-$in_file_ext    = $in_file_ext    ? $in_file_ext    : 'sh';
-$batch_file_ext = $batch_file_ext ? $batch_file_ext : 'sh';
-$log_file_ext   = $log_file_ext   ? $log_file_ext   : 'log';
-system "mkdir -p $exe_dir" unless -e $exe_dir;
-my %job_id_HoA;    # key = job_id, value = array of file_id
-my $count = 0;
-opendir( DIR, $in_dir ) or die "can't open $in_dir: $!";
-while ( defined( my $in_file = readdir(DIR) ) ) {
-    if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) {
-        my $job_id = ( $count % $n_job ) + 1;
-        push @{ $job_id_HoA{$job_id} }, $1;
-        $count++;
-    }
-}
-closedir(DIR);
-foreach my $job_id ( sort keys %job_id_HoA ) {
-    my $batch_file = $exe_dir . $prefix . $job_id . '.' . $batch_file_ext;
-    my $log_file   = $exe_dir . $prefix . $job_id . '.' . $log_file_ext;
-    open OUT, ">$batch_file" or die "Can't open output file $batch_file: $!\n";
-    # shell
-    print OUT '#!/bin/bash', "\n";
-    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
-        print OUT "$in_dir$file_id\.$in_file_ext\n";
-    }
-    close OUT;
-    system "chmod +x $batch_file";
-    system "$batch_file > $log_file 2>&1 &";
-    print "command: $batch_file > $log_file 2>&1 &\n";
-}
-exit(0);
-</code>