tutorials:perl_examples
Differences
This shows you the differences between two versions of the page.
Both sides previous revisionPrevious revisionNext revision | Previous revision | ||
tutorials:perl_examples [2010/08/03 02:21] – chkuo | tutorials:perl_examples [2017/04/12 17:27] (current) – chkuo | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== Perl Examples | + | ====== Perl examples |
- | * Find empty files in a directory: [[tutorial: | + | * Find empty files in a directory: [[tutorials: |
- | * Unwrap sequences in fasta files: [[tutorial:perl:unwrap_seq_fasta.1.pl]] | + | * Rename |
- | * Find homopolymeric regions: [[tutorial: | + | |
- | <code perl> | + | |
- | # | + | |
- | my $script_name = ' | + | * Generate command scripts to run blast+: [[tutorials: |
+ | * Execute command scripts: [[tutorials: | ||
+ | * Parse blast+ results, 1 hit per line: [[tutorials: | ||
+ | * Parse blast+ results, 1 HSP per line: [[tutorials: | ||
- | # Chih-Horng Kuo < | + | * Unwrap sequences in fasta files: [[tutorials: |
- | # read fasta file, find homopolymeric regions | + | * Find homopolymeric regions: |
- | # v3 2010/ | + | * Correct sequence orientation |
- | # bug fix: process the last base | + | * Trim sequence based on lucy: [[tutorials: |
- | # | + | * Trim sequence based on regex: [[tutorials: |
- | # v2 2009/ | + | |
- | # | + | |
- | # v1 2009/09/22 | + | |
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | use File:: | ||
- | |||
- | my $in_file; | ||
- | my $out_file; | ||
- | my $min; | ||
- | my $verbose; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | $min = $min ? $min : ' | ||
- | |||
- | my $out_dir = dirname($out_file); | ||
- | system "mkdir -p $out_dir" | ||
- | |||
- | # read in_file | ||
- | my %seq_hash; | ||
- | my %length_hash; | ||
- | { | ||
- | open IN, "< | ||
- | |||
- | # redefine the record separator | ||
- | local $/ = ">"; | ||
- | my $in_line = < | ||
- | while ( $in_line = <IN> ) { | ||
- | chomp $in_line; | ||
- | my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); | ||
- | $seq =~ tr/ \t\n\r// | ||
- | $seq_hash{$seq_name} | ||
- | $length_hash{$seq_name} = length $seq; | ||
- | } | ||
- | close IN; | ||
- | } | ||
- | |||
- | open OUT, "> | ||
- | |||
- | my @seq_names = sort keys %seq_hash; | ||
- | my $count_seq = scalar @seq_names; | ||
- | |||
- | foreach my $seq_name (@seq_names) { | ||
- | my %end_hash; | ||
- | my %size_hash; | ||
- | my %base_hash; | ||
- | # pre-process the 1st base | ||
- | my @chars = split( //, $seq_hash{$seq_name} ); | ||
- | my $pre = shift @chars; | ||
- | my $start = 1; # set start | ||
- | my $size = 1; # set size | ||
- | my $position = 1; # set position | ||
- | foreach my $char (@chars) { | ||
- | # update position | ||
- | $position++; | ||
- | | ||
- | if ( $char eq $pre ) { | ||
- | # the current base is the same as the previous one | ||
- | $size++; | ||
- | } | ||
- | else { | ||
- | # the current base is different from the previous one | ||
- | # terminate the extension | ||
- | if ( $size >= $min ) { | ||
- | # if the size reaches the report threshold | ||
- | $end_hash{$start} | ||
- | $size_hash{$start} = $size; | ||
- | $base_hash{$start} = $pre; | ||
- | } | ||
- | else { | ||
- | # do nothing | ||
- | } | ||
- | |||
- | # reset | ||
- | $start = $position; | ||
- | $size = 1; | ||
- | } | ||
- | |||
- | # update $pre | ||
- | $pre = $char; | ||
- | } | ||
- | | ||
- | # process the last char | ||
- | if ( $size >= $min ) { | ||
- | # if the size reaches the report threshold | ||
- | $end_hash{$start} | ||
- | $size_hash{$start} = $size; | ||
- | $base_hash{$start} = $pre; | ||
- | } | ||
- | else { | ||
- | # do nothing | ||
- | } | ||
- | |||
- | # print location | ||
- | my $count = 0; | ||
- | foreach $start ( sort { $a <=> $b } keys %end_hash ) { | ||
- | $count++; | ||
- | print OUT " | ||
- | " | ||
- | " | ||
- | } | ||
- | |||
- | } | ||
- | close OUT; | ||
- | |||
- | if ($verbose) { | ||
- | } | ||
- | |||
- | if ($debug) { | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | ===== Generate Command Scripts for Running Blast+ ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo | ||
- | # generate commands for running NCBI blast+ | ||
- | # v1 2010/07/13 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | use File:: | ||
- | |||
- | my $exe; | ||
- | my $in_dir; | ||
- | my $out_dir; | ||
- | my $sh_dir; | ||
- | my $in_file_ext; | ||
- | my $out_file_ext; | ||
- | my $sh_prefix; | ||
- | my $opt; | ||
- | my $n_job; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | $exe = $exe ? $exe : '/ | ||
- | $in_file_ext | ||
- | $out_file_ext = $out_file_ext ? $out_file_ext : ' | ||
- | $sh_prefix | ||
- | $n_job | ||
- | |||
- | system "mkdir -p $out_dir" | ||
- | system "mkdir -p $sh_dir" | ||
- | |||
- | my $count = 0; | ||
- | my %job_id_HoA; | ||
- | opendir( DIR, $in_dir ) or die " | ||
- | while ( defined( my $in_file = readdir(DIR) ) ) { | ||
- | if ( $in_file =~ / | ||
- | my $job_id = ( $count % $n_job ) + 1; | ||
- | push @{ $job_id_HoA{$job_id} }, $1; | ||
- | $count++; | ||
- | } | ||
- | } | ||
- | closedir(DIR); | ||
- | |||
- | # generate job .sh | ||
- | foreach my $job_id ( sort keys %job_id_HoA ) { | ||
- | my $sh_file = $sh_dir . $sh_prefix . $job_id . ' | ||
- | open OUT, "> | ||
- | |||
- | # shell | ||
- | print OUT '# | ||
- | |||
- | foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { | ||
- | my $in_file = $in_dir . $file_id . ' | ||
- | my $out_file = $out_dir . $file_id . ' | ||
- | |||
- | print OUT "$exe -query $in_file -out $out_file"; | ||
- | if ($opt) { | ||
- | print OUT " $opt"; | ||
- | } | ||
- | print OUT " | ||
- | } | ||
- | |||
- | close OUT; | ||
- | system "chmod +x $sh_file"; | ||
- | } | ||
- | |||
- | if ($debug) { | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | ===== Execute Command Scripts ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo < | ||
- | # execute all .sh in the in_dir | ||
- | # v3 2010/02/04 | ||
- | # style change | ||
- | # v2 2009/06/18 | ||
- | # v1 2006/05/03 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | |||
- | my $in_dir; | ||
- | my $exe_dir; | ||
- | my $in_file_ext; | ||
- | my $batch_file_ext; | ||
- | my $log_file_ext; | ||
- | my $prefix; | ||
- | my $n_job; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | $prefix | ||
- | $in_file_ext | ||
- | $batch_file_ext = $batch_file_ext ? $batch_file_ext : ' | ||
- | $log_file_ext | ||
- | |||
- | system "mkdir -p $exe_dir" | ||
- | |||
- | my %job_id_HoA; | ||
- | my $count = 0; | ||
- | opendir( DIR, $in_dir ) or die " | ||
- | while ( defined( my $in_file = readdir(DIR) ) ) { | ||
- | if ( $in_file =~ / | ||
- | my $job_id = ( $count % $n_job ) + 1; | ||
- | push @{ $job_id_HoA{$job_id} }, $1; | ||
- | $count++; | ||
- | } | ||
- | } | ||
- | closedir(DIR); | ||
- | |||
- | foreach my $job_id ( sort keys %job_id_HoA ) { | ||
- | my $batch_file = $exe_dir . $prefix . $job_id . ' | ||
- | my $log_file | ||
- | open OUT, "> | ||
- | |||
- | # shell | ||
- | print OUT '# | ||
- | foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { | ||
- | print OUT " | ||
- | } | ||
- | |||
- | close OUT; | ||
- | system "chmod +x $batch_file"; | ||
- | system " | ||
- | print " | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
tutorials/perl_examples.1280773271.txt.gz · Last modified: 2010/08/03 02:21 by chkuo