User Tools

Site Tools


tutorials:perl_examples

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revisionPrevious revision
Next revision
Previous revision
tutorials:perl_examples [2010/08/03 02:20] chkuo
tutorials:perl_examples [2017/04/12 17:27] (current) chkuo
Line 1: Line 1:
-====== Perl Examples ======
+====== Perl examples ======
  
-  * Find empty files in a directory: [[tutorial:perl:find_empty_file.2.pl]]
+  * Find empty files in a directory: [[tutorials:perl:find_empty_file.2.pl]]
-  * Unwrap sequences in fasta files: [[tutorial:perl:unwrap_seq_fasta.1.pl]]
+  * Rename files based on regex: [[tutorials:perl:rename_file_by_regex.3.pl]]
  
 +  * Generate command scripts to run blast+: [[tutorials:perl:cmd_blast+.1.pl]]
 +  * Execute command scripts: [[tutorials:perl:execute.3.pl]]
 +  * Parse blast+ results, 1 hit per line: [[tutorials:perl:parse_blast_sim.6.pl]]
 +  * Parse blast+ results, 1 HSP per line: [[tutorials:perl:parse_blast_hsp.4.pl]]
  
-===== Find Homopolymeric Regions =====
-<code perl>
-#!/usr/bin/perl -w
+  * Unwrap sequences in fasta files: [[tutorials:perl:unwrap_seq_fasta.1.pl]]
+  * Find homopolymeric regions: [[tutorials:perl:find_homopolymer.3.pl]]
+  * Correct sequence orientation in a fasta file: [[tutorials:perl:correct_fasta_orientation.1.pl]]
 +  * Trim sequence based on lucy: [[tutorials:perl:trim_lucy.2.pl]] 
 +  * Trim sequence based on regex: [[tutorials:perl:trim_seq_by_regex.1.pl]]
  
-my $script_name = 'find_homopolymer.3.pl';
-
-# Chih-Horng Kuo <chkuo@lifedev.org>
-# read fasta file, find homopolymeric regions in the seqs
-# v3 2010/07/13
-#   bug fix: process the last base
-#   change output format
-# v2 2009/09/29
-#   report the position of homopolymers in GFF3 format
-# v1 2009/09/22
-#
-# Usage:
-#   find_homopolymer.3.pl --in_file=<fasta> --out_file=<file> [--min=<int>]
-#
-# Output: one tab-delimited line per homopolymeric run of at least --min
-# bases (default 5):
-#   seq_name, run number within the sequence, start, end, run length, base
-# Positions are 1-based and inclusive. Sequences are upper-cased before
-# scanning and reported in sorted name order.
-
-use strict;
-use warnings;
-
-use Getopt::Long;
-use File::Basename;
-use File::Path qw(make_path);
-
-my $in_file;
-my $out_file;
-my $min;
-my $verbose;
-my $debug;
-
-GetOptions(
-    "in_file=s"  => \$in_file,
-    "out_file=s" => \$out_file,
-    "min=i"      => \$min,
-    "verbose=i"  => \$verbose,    # accepted for compatibility, not used
-    "debug=i"    => \$debug,      # accepted for compatibility, not used
-);
-
-# Both paths are required; fail early with a usage line instead of
-# letting dirname()/open() trip over an undefined value.
-die "Usage: $script_name --in_file=<fasta> --out_file=<file> [--min=<int>]\n"
-  unless ( defined $in_file && defined $out_file );
-
-# An omitted (or zero) threshold falls back to 5.
-$min ||= 5;
-
-# Create the output directory without going through the shell.
-my $out_dir = dirname($out_file);
-make_path($out_dir) unless -d $out_dir;
-
-# read in_file
-my %seq_hash;    # key = seq_name, value = upper-cased seq
-{
-    open my $in_fh, '<', $in_file
-      or die "Can't open input file $in_file: $!\n";
-
-    # redefine the record separator so that each read returns one record
-    local $/ = ">";
-    my $in_line = <$in_fh>;    # toss the first record, which only consists of ">"
-    while ( $in_line = <$in_fh> ) {
-        chomp $in_line;        # strips the trailing ">" separator
-        my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
-
-        # a completely empty record (e.g. ">>") has no name at all
-        next unless defined $seq_name;
-
-        # a header with no sequence lines yields an empty sequence
-        $seq = '' unless defined $seq;
-
-        $seq_name =~ s/\r\z//;    # drop the CR of a CRLF-terminated header
-        $seq      =~ tr/ \t\n\r//d;    # Remove whitespace
-        $seq_hash{$seq_name} = uc $seq;    # convert seq to all upper case
-    }
-    close $in_fh;
-}
-
-open my $out_fh, '>', $out_file
-  or die "Can't open output file $out_file: $!\n";
-
-# A run of at least $min identical characters is one character followed by
-# at least ($min - 1) repeats of it. Matching is greedy and scans left to
-# right, so every match is a maximal run and runs come out in position order.
-my $extra  = $min > 1 ? $min - 1 : 0;
-my $run_re = qr/(.)\1{$extra,}/;
-
-foreach my $seq_name ( sort keys %seq_hash ) {
-    my $seq   = $seq_hash{$seq_name};
-    my $count = 0;    # run number within this sequence
-    while ( $seq =~ /$run_re/g ) {
-        $count++;
-        my $start = $-[0] + 1;      # @- holds 0-based match start offsets
-        my $end   = $+[0];          # @+ holds the offset just past the match
-        my $size  = $end - $-[0];
-        print {$out_fh}
-          join( "\t", $seq_name, $count, $start, $end, $size, $1 ), "\n";
-    }
-}
-
-# Buffered write errors only surface at close.
-close $out_fh or die "Can't close output file $out_file: $!\n";
-
-exit(0);
-</code> 
- 
-===== Generate Command Scripts for Running Blast+ ===== 
-<code perl> 
-#!/usr/bin/perl -w
-
-my $script_name = 'cmd_blast+.1.pl';
-
-# Chih-Horng Kuo
-# generate commands for running NCBI blast+
-# v1 2010/07/13
-#
-# Usage:
-#   cmd_blast+.1.pl --in_dir=<dir> --out_dir=<dir> --sh_dir=<dir>
-#       [--exe=<path>] [--in_file_ext=<ext>] [--out_file_ext=<ext>]
-#       [--sh_prefix=<str>] [--opt=<str>] [--n_job=<int>]
-#
-# For every <id>.<in_file_ext> found in in_dir, one command line
-#   <exe> -query <in_dir><id>.<in_file_ext> -out <out_dir><id>.<out_file_ext> [<opt>]
-# is generated. The command lines are distributed round-robin over n_job
-# executable shell scripts named <sh_dir><sh_prefix><job_id>.sh.
-
-use strict;
-use warnings;
-
-use Getopt::Long;
-use File::Basename;
-use File::Path qw(make_path);
-
-my $exe;
-my $in_dir;
-my $out_dir;
-my $sh_dir;
-my $in_file_ext;
-my $out_file_ext;
-my $sh_prefix;
-my $opt;
-my $n_job;
-my $debug;
-
-GetOptions(
-    "exe=s"          => \$exe,
-    "in_dir=s"       => \$in_dir,
-    "out_dir=s"      => \$out_dir,
-    "sh_dir=s"       => \$sh_dir,
-    "in_file_ext=s"  => \$in_file_ext,
-    "out_file_ext=s" => \$out_file_ext,
-    "sh_prefix=s"    => \$sh_prefix,
-    "opt=s"          => \$opt,
-    "n_job=i"        => \$n_job,
-    "debug=i"        => \$debug,           # accepted for compatibility, not used
-);
-
-# defaults for omitted (or empty) options
-$exe          ||= '/usr/local/blast+/bin/blastn';
-$in_file_ext  ||= 'fasta';
-$out_file_ext ||= 'blast';
-$sh_prefix    ||= 'job';
-$n_job        ||= 1;
-
-# The three directories have no sensible default.
-die "Usage: $script_name --in_dir=<dir> --out_dir=<dir> --sh_dir=<dir> [options]\n"
-  if grep { !defined $_ || $_ eq '' } $in_dir, $out_dir, $sh_dir;
-
-# A non-positive job count would produce nonsensical job ids.
-die "n_job must be a positive integer\n" if $n_job < 1;
-
-# The directory options are used as plain path prefixes below; add the
-# trailing slash when the caller left it off. (The loop variable aliases
-# the original scalars, so they are modified in place.)
-for my $dir ( $in_dir, $out_dir, $sh_dir ) {
-    $dir .= '/' unless $dir =~ m{/\z};
-}
-
-# Create the output directories without going through the shell.
-make_path($out_dir) unless -d $out_dir;
-make_path($sh_dir)  unless -d $sh_dir;
-
-# Collect the ids of all input files. The extension is quoted so that it is
-# matched literally, and the pattern is anchored so that a file name that
-# contains whitespace is skipped instead of being truncated to its tail.
-opendir( my $dir_fh, $in_dir ) or die "can't open $in_dir: $!\n";
-my @file_ids =
-  sort map { /\A(\S+)\.\Q$in_file_ext\E\z/ ? $1 : () } readdir $dir_fh;
-closedir $dir_fh;
-
-# Assign the (sorted, hence reproducible) file ids round-robin to the jobs.
-my $count = 0;
-my %job_id_HoA;    # key = job_id, value = array of file_id
-foreach my $file_id (@file_ids) {
-    my $job_id = ( $count % $n_job ) + 1;
-    push @{ $job_id_HoA{$job_id} }, $file_id;
-    $count++;
-}
-
-# generate job .sh
-foreach my $job_id ( sort { $a <=> $b } keys %job_id_HoA ) {
-    my $sh_file = $sh_dir . $sh_prefix . $job_id . '.sh';
-    open my $out_fh, '>', $sh_file
-      or die "Can't open output file $sh_file: $!\n";
-
-    # shell
-    print {$out_fh} '#!/bin/bash', "\n";
-
-    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
-        my $in_file  = $in_dir . $file_id . '.' . $in_file_ext;
-        my $out_file = $out_dir . $file_id . '.' . $out_file_ext;
-
-        my $command = "$exe -query $in_file -out $out_file";
-        $command .= " $opt" if $opt;    # extra blast+ options, passed verbatim
-        print {$out_fh} "$command\n";
-    }
-
-    # Buffered write errors only surface at close.
-    close $out_fh or die "Can't close output file $sh_file: $!\n";
-
-    # list form: the file name is never interpreted by a shell
-    system( 'chmod', '+x', $sh_file ) == 0
-      or die "Can't make $sh_file executable\n";
-}
-
-exit(0);
-</code> 
- 
-===== Execute Command Scripts ===== 
-<code perl> 
-#!/usr/bin/perl -w
-
-my $script_name = 'execute.3.pl';
-
-# Chih-Horng Kuo <chkuo@lifedev.org>
-# execute all .sh in the in_dir
-# v3 2010/02/04
-#   style change
-# v2 2009/06/18
-# v1 2006/05/03
-#
-# Usage:
-#   execute.3.pl --in_dir=<dir> --exe_dir=<dir> [--n_job=<int>]
-#       [--in_file_ext=<ext>] [--batch_file_ext=<ext>]
-#       [--log_file_ext=<ext>] [--prefix=<str>]
-#
-# The <id>.<in_file_ext> files found in in_dir are distributed round-robin
-# over n_job batch files <exe_dir><prefix><job_id>.<batch_file_ext>. Each
-# batch file is started in the background with its stdout and stderr
-# redirected to <exe_dir><prefix><job_id>.<log_file_ext>.
-
-use strict;
-use warnings;
-
-use Getopt::Long;
-use File::Path qw(make_path);
-
-my $in_dir;
-my $exe_dir;
-my $in_file_ext;
-my $batch_file_ext;
-my $log_file_ext;
-my $prefix;    # prefix of batch files
-my $n_job;     # split into n batch files
-my $debug;
-
-GetOptions(
-    "in_dir=s"         => \$in_dir,
-    "exe_dir=s"        => \$exe_dir,
-    "in_file_ext=s"    => \$in_file_ext,
-    "batch_file_ext=s" => \$batch_file_ext,
-    "log_file_ext=s"   => \$log_file_ext,
-    "prefix=s"         => \$prefix,
-    "n_job=i"          => \$n_job,
-    "debug=i"          => \$debug,             # accepted for compatibility, not used
-);
-
-# defaults for omitted (or empty) options
-$prefix         ||= 'job';
-$in_file_ext    ||= 'sh';
-$batch_file_ext ||= 'sh';
-$log_file_ext   ||= 'log';
-
-# Without a default, omitting --n_job made the modulus below die with
-# "Illegal modulus zero". Fall back to a single batch.
-$n_job ||= 1;
-
-# The two directories have no sensible default.
-die "Usage: $script_name --in_dir=<dir> --exe_dir=<dir> [options]\n"
-  if grep { !defined $_ || $_ eq '' } $in_dir, $exe_dir;
-
-# A non-positive job count would produce nonsensical job ids.
-die "n_job must be a positive integer\n" if $n_job < 1;
-
-# The directory options are used as plain path prefixes below; add the
-# trailing slash when the caller left it off. (The loop variable aliases
-# the original scalars, so they are modified in place.)
-for my $dir ( $in_dir, $exe_dir ) {
-    $dir .= '/' unless $dir =~ m{/\z};
-}
-
-# Create the batch directory without going through the shell.
-make_path($exe_dir) unless -d $exe_dir;
-
-# Collect the ids of all input scripts. The extension is quoted so that it
-# is matched literally, and the pattern is anchored so that a file name that
-# contains whitespace is skipped instead of being truncated to its tail.
-opendir( my $dir_fh, $in_dir ) or die "can't open $in_dir: $!";
-my @file_ids =
-  sort map { /\A(\S+)\.\Q$in_file_ext\E\z/ ? $1 : () } readdir $dir_fh;
-closedir $dir_fh;
-
-# Assign the (sorted, hence reproducible) file ids round-robin to the jobs.
-my %job_id_HoA;    # key = job_id, value = array of file_id
-my $count = 0;
-foreach my $file_id (@file_ids) {
-    my $job_id = ( $count % $n_job ) + 1;
-    push @{ $job_id_HoA{$job_id} }, $file_id;
-    $count++;
-}
-
-foreach my $job_id ( sort { $a <=> $b } keys %job_id_HoA ) {
-    my $batch_file = $exe_dir . $prefix . $job_id . '.' . $batch_file_ext;
-    my $log_file   = $exe_dir . $prefix . $job_id . '.' . $log_file_ext;
-    open my $out_fh, '>', $batch_file
-      or die "Can't open output file $batch_file: $!\n";
-
-    # shell
-    print {$out_fh} '#!/bin/bash', "\n";
-
-    # one line per input script; the batch file runs them one after another
-    foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) {
-        print {$out_fh} $in_dir . $file_id . '.' . $in_file_ext . "\n";
-    }
-
-    # Buffered write errors only surface at close.
-    close $out_fh or die "Can't close output file $batch_file: $!\n";
-
-    # list form: the file name is never interpreted by a shell
-    system( 'chmod', '+x', $batch_file ) == 0
-      or die "Can't make $batch_file executable\n";
-
-    # The redirection and the trailing "&" need a shell, so this call stays
-    # in string form; the batch runs in the background and is not waited for.
-    my $command = "$batch_file > $log_file 2>&1 &";
-    system $command;
-    print "command: $command\n";
-}
-
-exit(0);
-</code> 
  
  
tutorials/perl_examples.1280773220.txt.gz · Last modified: 2010/08/03 02:20 by chkuo