tutorials:perl_examples
Differences
This shows you the differences between two versions of the page.
Next revision | Previous revision | ||
tutorials:perl_examples [2010/07/31 01:05] – created chkuo | tutorials:perl_examples [2017/04/12 17:27] (current) – chkuo | ||
---|---|---|---|
Line 1: | Line 1: | ||
- | ====== Perl Examples | + | ====== Perl examples |
- | ===== Find Empty Files in a Directory ===== | + | * Find empty files in a directory: [[tutorials: |
- | < | + | * Rename files based on regex: [[tutorials: |
- | #!/usr/bin/perl -w | + | |
- | my $script_name = ' | + | * Generate command scripts to run blast+: [[tutorials: |
+ | * Execute command scripts: [[tutorials: | ||
+ | * Parse blast+ results, 1 hit per line: [[tutorials: | ||
+ | * Parse blast+ results, 1 HSP per line: [[tutorials: | ||
- | # Chih-Horng Kuo | + | * Unwrap sequences |
- | # find empty files in the in_dir (and possibly remove them) | + | * Find homopolymeric regions: [[tutorials: |
- | # v2 2010/07/12 | + | * Correct sequence orientation in a fasta file: [[tutorials: |
- | # style change | + | * Trim sequence based on lucy: [[tutorials: |
- | # v1 2007/05/25 | + | * Trim sequence based on regex: [[tutorials: |
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | |||
- | my $in_dir; | ||
- | my $regex; | ||
- | my $opt_rm; | ||
- | my $verbose; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | my $count_in | ||
- | my $count_non_empty = 0; | ||
- | my $count_empty | ||
- | |||
- | # get list of in_files | ||
- | chdir $in_dir or die " | ||
- | my @in_files = glob " | ||
- | foreach my $in_file (@in_files) { | ||
- | $count_in++; | ||
- | if ( -z $in_file ) { | ||
- | $count_empty++; | ||
- | print " | ||
- | unlink $in_file if ($opt_rm); | ||
- | } | ||
- | else { | ||
- | $count_non_empty++; | ||
- | } | ||
- | } | ||
- | |||
- | print " | ||
- | print " | ||
- | print " | ||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | ===== Unwrap Sequences in Fasta Files ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo < | ||
- | # remove extra line-breaks (in the sequences) in fasta files | ||
- | # v1 2010/03/04 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | use File:: | ||
- | |||
- | my $in_dir; | ||
- | my $in_file_ext; | ||
- | my $out_dir; | ||
- | my $out_file_ext; | ||
- | my $verbose; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | system "mkdir -p $out_dir" | ||
- | $in_file_ext | ||
- | $out_file_ext = $out_file_ext ? $out_file_ext : ' | ||
- | |||
- | my $count_file = 0; | ||
- | opendir( DIR, $in_dir ) or die " | ||
- | while ( defined( my $in_file = readdir(DIR) ) ) { | ||
- | if ( $in_file =~ / | ||
- | my $file_id = $1; | ||
- | my $count_seq = 0; | ||
- | $count_file++; | ||
- | | ||
- | $in_file = $in_dir . $in_file; | ||
- | my $out_file = $out_dir . $file_id . ' | ||
- | open OUT, "> | ||
- | { | ||
- | # redefine the record separator | ||
- | local $/ = ">"; | ||
- | open IN, "< | ||
- | my $in_line = <IN>; # toss the first record, which only consists of ">" | ||
- | while ( $in_line = <IN> ) { | ||
- | chomp $in_line; | ||
- | my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); | ||
- | $seq =~ tr/ \t\n\r// | ||
- | $count_seq++; | ||
- | | ||
- | print OUT " | ||
- | } | ||
- | close IN; | ||
- | } | ||
- | close OUT; | ||
- | |||
- | if ($verbose) { | ||
- | print " | ||
- | } | ||
- | | ||
- | } | ||
- | } | ||
- | closedir(DIR); | ||
- | |||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | |||
- | ===== Find Homopolymeric Regions ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo < | ||
- | # read fasta file, find homopolymeric regions in the seqs | ||
- | # v3 2010/07/13 | ||
- | # bug fix: process the last base | ||
- | # | ||
- | # v2 2009/09/29 | ||
- | # | ||
- | # v1 2009/09/22 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | use File:: | ||
- | |||
- | my $in_file; | ||
- | my $out_file; | ||
- | my $min; | ||
- | my $verbose; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | $min = $min ? $min : ' | ||
- | |||
- | my $out_dir = dirname($out_file); | ||
- | system "mkdir -p $out_dir" | ||
- | |||
- | # read in_file | ||
- | my %seq_hash; | ||
- | my %length_hash; | ||
- | { | ||
- | open IN, "< | ||
- | |||
- | # redefine the record separator | ||
- | local $/ = ">"; | ||
- | my $in_line = < | ||
- | while ( $in_line = <IN> ) { | ||
- | chomp $in_line; | ||
- | my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); | ||
- | $seq =~ tr/ \t\n\r// | ||
- | $seq_hash{$seq_name} | ||
- | $length_hash{$seq_name} = length $seq; | ||
- | } | ||
- | close IN; | ||
- | } | ||
- | |||
- | open OUT, "> | ||
- | |||
- | my @seq_names = sort keys %seq_hash; | ||
- | my $count_seq = scalar @seq_names; | ||
- | |||
- | foreach my $seq_name (@seq_names) { | ||
- | my %end_hash; | ||
- | my %size_hash; | ||
- | my %base_hash; | ||
- | # pre-process the 1st base | ||
- | my @chars = split( //, $seq_hash{$seq_name} ); | ||
- | my $pre = shift @chars; | ||
- | my $start = 1; # set start | ||
- | my $size = 1; # set size | ||
- | my $position = 1; # set position | ||
- | foreach my $char (@chars) { | ||
- | # update position | ||
- | $position++; | ||
- | | ||
- | if ( $char eq $pre ) { | ||
- | # the current base is the same as the previous one | ||
- | $size++; | ||
- | } | ||
- | else { | ||
- | # the current base is different from the previous one | ||
- | # terminate the extension | ||
- | if ( $size >= $min ) { | ||
- | # if the size reaches the report threshold | ||
- | $end_hash{$start} | ||
- | $size_hash{$start} = $size; | ||
- | $base_hash{$start} = $pre; | ||
- | } | ||
- | else { | ||
- | # do nothing | ||
- | } | ||
- | |||
- | # reset | ||
- | $start = $position; | ||
- | $size = 1; | ||
- | } | ||
- | |||
- | # update $pre | ||
- | $pre = $char; | ||
- | } | ||
- | | ||
- | # process the last char | ||
- | if ( $size >= $min ) { | ||
- | # if the size reaches the report threshold | ||
- | $end_hash{$start} | ||
- | $size_hash{$start} = $size; | ||
- | $base_hash{$start} = $pre; | ||
- | } | ||
- | else { | ||
- | # do nothing | ||
- | } | ||
- | |||
- | # print location | ||
- | my $count = 0; | ||
- | foreach $start ( sort { $a <=> $b } keys %end_hash ) { | ||
- | $count++; | ||
- | print OUT " | ||
- | " | ||
- | " | ||
- | } | ||
- | |||
- | } | ||
- | close OUT; | ||
- | |||
- | if ($verbose) { | ||
- | } | ||
- | |||
- | if ($debug) { | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | ===== Generate Command Scripts for Running Blast+ ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo | ||
- | # generate commands for running NCBI blast+ | ||
- | # v1 2010/07/13 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | use File:: | ||
- | |||
- | my $exe; | ||
- | my $in_dir; | ||
- | my $out_dir; | ||
- | my $sh_dir; | ||
- | my $in_file_ext; | ||
- | my $out_file_ext; | ||
- | my $sh_prefix; | ||
- | my $opt; | ||
- | my $n_job; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | |||
- | $exe = $exe ? $exe : '/ | ||
- | $in_file_ext | ||
- | $out_file_ext = $out_file_ext ? $out_file_ext : ' | ||
- | $sh_prefix | ||
- | $n_job | ||
- | |||
- | system "mkdir -p $out_dir" | ||
- | system "mkdir -p $sh_dir" | ||
- | |||
- | my $count = 0; | ||
- | my %job_id_HoA; | ||
- | opendir( DIR, $in_dir ) or die " | ||
- | while ( defined( my $in_file = readdir(DIR) ) ) { | ||
- | if ( $in_file =~ / | ||
- | my $job_id = ( $count % $n_job ) + 1; | ||
- | push @{ $job_id_HoA{$job_id} }, $1; | ||
- | $count++; | ||
- | } | ||
- | } | ||
- | closedir(DIR); | ||
- | |||
- | # generate job .sh | ||
- | foreach my $job_id ( sort keys %job_id_HoA ) { | ||
- | my $sh_file = $sh_dir . $sh_prefix . $job_id . ' | ||
- | open OUT, "> | ||
- | |||
- | # shell | ||
- | print OUT '# | ||
- | |||
- | foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { | ||
- | my $in_file = $in_dir . $file_id . ' | ||
- | my $out_file = $out_dir . $file_id . ' | ||
- | |||
- | print OUT "$exe -query $in_file -out $out_file"; | ||
- | if ($opt) { | ||
- | print OUT " $opt"; | ||
- | } | ||
- | print OUT " | ||
- | } | ||
- | |||
- | close OUT; | ||
- | system "chmod +x $sh_file"; | ||
- | } | ||
- | |||
- | if ($debug) { | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
- | |||
- | ===== Execute Command Scripts ===== | ||
- | <code perl> | ||
- | # | ||
- | |||
- | my $script_name = ' | ||
- | |||
- | # Chih-Horng Kuo < | ||
- | # execute all .sh in the in_dir | ||
- | # v3 2010/02/04 | ||
- | # style change | ||
- | # v2 2009/06/18 | ||
- | # v1 2006/05/03 | ||
- | |||
- | use strict; | ||
- | use warnings; | ||
- | |||
- | use Getopt:: | ||
- | |||
- | my $in_dir; | ||
- | my $exe_dir; | ||
- | my $in_file_ext; | ||
- | my $batch_file_ext; | ||
- | my $log_file_ext; | ||
- | my $prefix; | ||
- | my $n_job; | ||
- | my $debug; | ||
- | |||
- | GetOptions( | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | " | ||
- | ); | ||
- | $prefix | ||
- | $in_file_ext | ||
- | $batch_file_ext = $batch_file_ext ? $batch_file_ext : ' | ||
- | $log_file_ext | ||
- | |||
- | system "mkdir -p $exe_dir" | ||
- | |||
- | my %job_id_HoA; | ||
- | my $count = 0; | ||
- | opendir( DIR, $in_dir ) or die " | ||
- | while ( defined( my $in_file = readdir(DIR) ) ) { | ||
- | if ( $in_file =~ / | ||
- | my $job_id = ( $count % $n_job ) + 1; | ||
- | push @{ $job_id_HoA{$job_id} }, $1; | ||
- | $count++; | ||
- | } | ||
- | } | ||
- | closedir(DIR); | ||
- | |||
- | foreach my $job_id ( sort keys %job_id_HoA ) { | ||
- | my $batch_file = $exe_dir . $prefix . $job_id . ' | ||
- | my $log_file | ||
- | open OUT, "> | ||
- | |||
- | # shell | ||
- | print OUT '# | ||
- | foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { | ||
- | print OUT " | ||
- | } | ||
- | |||
- | close OUT; | ||
- | system "chmod +x $batch_file"; | ||
- | system " | ||
- | print " | ||
- | } | ||
- | |||
- | exit(0); | ||
- | </ | ||
tutorials/perl_examples.1280509543.txt.gz · Last modified: 2010/07/31 01:05 by chkuo