tutorials:perl_examples
This is an old revision of the document!
Table of Contents
Perl Examples
- Find Empty Files in a Directory: find_empty_file.2.pl
#!/usr/bin/perl -w my $script_name = 'find_empty_file.2.pl'; # Chih-Horng Kuo # find empty files in the in_dir (and possibly remove them) # v2 2010/07/12 # style change # v1 2007/05/25 use strict; use warnings; use Getopt::Long; my $in_dir; my $regex; my $opt_rm; my $verbose; my $debug; GetOptions( "in_dir=s" => \$in_dir, "regex=s" => \$regex, "opt_rm=i" => \$opt_rm, "verbose=i" => \$verbose, "debug=i" => \$debug, ); my $count_in = 0; my $count_non_empty = 0; my $count_empty = 0; # get list of in_files chdir $in_dir or die "cannot chdir to $in_dir: $!"; my @in_files = glob "$regex"; foreach my $in_file (@in_files) { $count_in++; if ( -z $in_file ) { $count_empty++; print "$in_dir$in_file\n" if ($verbose); unlink $in_file if ($opt_rm); } else { $count_non_empty++; } } print "count_in = $count_in\n"; print "count_non_empty = $count_non_empty\n"; print "count_empty = $count_empty\n"; exit(0);
Unwrap Sequences in Fasta Files
#!/usr/bin/perl -w my $script_name = 'unwrap_seq_fasta.1.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # remove extra line-breaks (in the sequences) in fasta files # v1 2010/03/04 use strict; use warnings; use Getopt::Long; use File::Basename; my $in_dir; my $in_file_ext; my $out_dir; my $out_file_ext; my $verbose; my $debug; GetOptions( "in_dir=s" => \$in_dir, "in_file_ext=s" => \$in_file_ext, "out_dir=s" => \$out_dir, "out_file_ext=s" => \$out_file_ext, "verbose=i" => \$verbose, "debug=i" => \$debug, ); system "mkdir -p $out_dir" unless -e $out_dir; $in_file_ext = $in_file_ext ? $in_file_ext : 'fasta'; $out_file_ext = $out_file_ext ? $out_file_ext : 'fasta'; my $count_file = 0; opendir( DIR, $in_dir ) or die "can't open $in_dir: $!\n"; while ( defined( my $in_file = readdir(DIR) ) ) { if ( $in_file =~ /^(\S+)\.$in_file_ext$/ ) { my $file_id = $1; my $count_seq = 0; $count_file++; $in_file = $in_dir . $in_file; my $out_file = $out_dir . $file_id . '.' . $out_file_ext; open OUT, ">$out_file" or die "Can't open output file $out_file\n"; { # redefine the record separator local $/ = ">"; open IN, "<$in_file" or die "Can't open input file $in_file: $!\n"; my $in_line = <IN>; # toss the first record, which only consists of ">" while ( $in_line = <IN> ) { chomp $in_line; my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); $seq =~ tr/ \t\n\r//d; # Remove whitespace $count_seq++; print OUT "\>$seq_name\n$seq\n"; } close IN; } close OUT; if ($verbose) { print "file_id = $file_id, count_seq = $count_seq\n"; } } } closedir(DIR); exit(0);
Find Homopolymeric Regions
#!/usr/bin/perl -w my $script_name = 'find_homopolymer.3.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # read fasta file, find homopolymeric regions in the seqs # v3 2010/07/13 # bug fix: process the last base # change output format # v2 2009/09/29 # report the position of homopolyers in GFF3 format # v1 2009/09/22 use strict; use warnings; use Getopt::Long; use File::Basename; my $in_file; my $out_file; my $min; my $verbose; my $debug; GetOptions( "in_file=s" => \$in_file, "out_file=s" => \$out_file, "min=i" => \$min, "verbose=i" => \$verbose, "debug=i" => \$debug, ); $min = $min ? $min : '5'; my $out_dir = dirname($out_file); system "mkdir -p $out_dir" unless -e $out_dir; # read in_file my %seq_hash; # key = seq_name, value = seq my %length_hash; # key = seq_name, value = seq_length { open IN, "<$in_file" or die "Can't open input file $in_file: $!\n"; # redefine the record separator local $/ = ">"; my $in_line = <IN>; # toss the first record, which only consists of ">" while ( $in_line = <IN> ) { chomp $in_line; my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); $seq =~ tr/ \t\n\r//d; # Remove whitespace $seq_hash{$seq_name} = uc $seq; # convert seq to all upper case $length_hash{$seq_name} = length $seq; } close IN; } open OUT, ">$out_file" or die "Can't open output file $out_file\n"; my @seq_names = sort keys %seq_hash; my $count_seq = scalar @seq_names; foreach my $seq_name (@seq_names) { my %end_hash; # key = start, value = end my %size_hash; # key = start, value = size my %base_hash; # key = start, value = base # pre-process the 1st base my @chars = split( //, $seq_hash{$seq_name} ); my $pre = shift @chars; my $start = 1; # set start my $size = 1; # set size my $position = 1; # set position foreach my $char (@chars) { # update position $position++; if ( $char eq $pre ) { # the current base is the same as the previous one $size++; } else { # the current base is different from the previous one # terminate the extension if ( $size >= $min ) { # if the size reaches the report threshold $end_hash{$start} = ( $position - 1 ); $size_hash{$start} = $size; $base_hash{$start} = $pre; } else { # do nothing } # reset $start = $position; $size = 1; } # update $pre $pre = $char; } # process the last char if ( $size >= $min ) { # if the size reaches the report threshold $end_hash{$start} = $position; $size_hash{$start} = $size; $base_hash{$start} = $pre; } else { # do nothing } # print location my $count = 0; foreach $start ( sort { $a <=> $b } keys %end_hash ) { $count++; print OUT "$seq_name\t$count\t", "$start\t$end_hash{$start}\t$size_hash{$start}\t", "$base_hash{$start}\n"; } } close OUT; if ($verbose) { } if ($debug) { } exit(0);
Generate Command Scripts for Running Blast+
#!/usr/bin/perl -w my $script_name = 'cmd_blast+.1.pl'; # Chih-Horng Kuo # generate commands for running NCBI blast+ # v1 2010/07/13 use strict; use warnings; use Getopt::Long; use File::Basename; my $exe; my $in_dir; my $out_dir; my $sh_dir; my $in_file_ext; my $out_file_ext; my $sh_prefix; my $opt; my $n_job; my $debug; GetOptions( "exe=s" => \$exe, "in_dir=s" => \$in_dir, "out_dir=s" => \$out_dir, "sh_dir=s" => \$sh_dir, "in_file_ext=s" => \$in_file_ext, "out_file_ext=s" => \$out_file_ext, "sh_prefix=s" => \$sh_prefix, "opt=s" => \$opt, "n_job=i" => \$n_job, "debug=i" => \$debug, ); $exe = $exe ? $exe : '/usr/local/blast+/bin/blastn'; $in_file_ext = $in_file_ext ? $in_file_ext : 'fasta'; $out_file_ext = $out_file_ext ? $out_file_ext : 'blast'; $sh_prefix = $sh_prefix ? $sh_prefix : 'job'; $n_job = $n_job ? $n_job : '1'; system "mkdir -p $out_dir" unless -e $out_dir; system "mkdir -p $sh_dir" unless -e $sh_dir; my $count = 0; my %job_id_HoA; # key = job_id, value = array of file_id opendir( DIR, $in_dir ) or die "can't open $in_dir: $!\n"; while ( defined( my $in_file = readdir(DIR) ) ) { if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) { my $job_id = ( $count % $n_job ) + 1; push @{ $job_id_HoA{$job_id} }, $1; $count++; } } closedir(DIR); # generate job .sh foreach my $job_id ( sort keys %job_id_HoA ) { my $sh_file = $sh_dir . $sh_prefix . $job_id . '.sh'; open OUT, ">$sh_file" or die "Can't open output file $sh_file: $!\n"; # shell print OUT '#!/bin/bash', "\n"; foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { my $in_file = $in_dir . $file_id . '.' . $in_file_ext; my $out_file = $out_dir . $file_id . '.' . $out_file_ext; print OUT "$exe -query $in_file -out $out_file"; if ($opt) { print OUT " $opt"; } print OUT "\n"; } close OUT; system "chmod +x $sh_file"; } if ($debug) { } exit(0);
Execute Command Scripts
#!/usr/bin/perl -w my $script_name = 'execute.3.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # execute all .sh in the in_dir # v3 2010/02/04 # style change # v2 2009/06/18 # v1 2006/05/03 use strict; use warnings; use Getopt::Long; my $in_dir; my $exe_dir; my $in_file_ext; my $batch_file_ext; my $log_file_ext; my $prefix; # prefix of batch files my $n_job; # split into n batch files my $debug; GetOptions( "in_dir=s" => \$in_dir, "exe_dir=s" => \$exe_dir, "in_file_ext=s" => \$in_file_ext, "batch_file_ext=s" => \$batch_file_ext, "log_file_ext=s" => \$log_file_ext, "prefix=s" => \$prefix, "n_job=i" => \$n_job, "debug=i" => \$debug, ); $prefix = $prefix ? $prefix : 'job'; $in_file_ext = $in_file_ext ? $in_file_ext : 'sh'; $batch_file_ext = $batch_file_ext ? $batch_file_ext : 'sh'; $log_file_ext = $log_file_ext ? $log_file_ext : 'log'; system "mkdir -p $exe_dir" unless -e $exe_dir; my %job_id_HoA; # key = job_id, value = array of file_id my $count = 0; opendir( DIR, $in_dir ) or die "can't open $in_dir: $!"; while ( defined( my $in_file = readdir(DIR) ) ) { if ( $in_file =~ /(\S+)\.$in_file_ext$/ ) { my $job_id = ( $count % $n_job ) + 1; push @{ $job_id_HoA{$job_id} }, $1; $count++; } } closedir(DIR); foreach my $job_id ( sort keys %job_id_HoA ) { my $batch_file = $exe_dir . $prefix . $job_id . '.' . $batch_file_ext; my $log_file = $exe_dir . $prefix . $job_id . '.' . $log_file_ext; open OUT, ">$batch_file" or die "Can't open output file $batch_file: $!\n"; # shell print OUT '#!/bin/bash', "\n"; foreach my $file_id ( @{ $job_id_HoA{$job_id} } ) { print OUT "$in_dir$file_id\.$in_file_ext\n"; } close OUT; system "chmod +x $batch_file"; system "$batch_file > $log_file 2>&1 &"; print "command: $batch_file > $log_file 2>&1 &\n"; } exit(0);
tutorials/perl_examples.1280773064.txt.gz · Last modified: 2010/08/03 02:17 by chkuo