tutorials:perl:trim_seq_by_regex.1.pl
Usage:
perl ~/plscript/trim_seq_by_regex.1.pl \ --in_file=INPUT_FILE \ --out_dir=OUTPUT_DIR \ --regex="AAACT[CT]AAA[GT]GAATTGACGG(\w+)$" \ --verbose=1
- trim_seq_by_regex.1.pl
#!/usr/bin/perl -w my $script_name = 'trim_seq_by_regex.1.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # trim seqs in a fasta file by provided regex # v1 2011/01/13 use strict; use warnings; use Getopt::Long; use File::Basename; my $in_file; my $out_dir; my $regex; my $verbose; my $debug; GetOptions( "in_file=s" => \$in_file, "out_dir=s" => \$out_dir, "regex=s" => \$regex, "verbose=i" => \$verbose, "debug=i" => \$debug, ); system "mkdir -p $out_dir" unless -e $out_dir; # read in_file my %seq_hash; #key = seq_name, value = seq { open IN, "<$in_file" or die "Can't open input file $in_file: $!\n"; # redefine the record separator local $/ = ">"; my $in_line = <IN>; # toss the first record, which only consists of ">" while ( my $in_line = <IN> ) { chomp $in_line; next unless $in_line; my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); $seq =~ tr/ \t\n\r//d; # remove whitespace $seq_hash{$seq_name} = $seq; } close IN; } my $count_in = scalar keys %seq_hash; my $out_trimmed = $out_dir . 'trimmed.fasta'; my $out_untrimmed = $out_dir . 'untrimmed.fasta'; my $count_trimmed = 0; my $count_untrimmed = 0; open OUT_TRIMMED, ">$out_trimmed" or die "Can't open output file $out_trimmed: $!\n"; open OUT_UNTRIMMED, ">$out_untrimmed" or die "Can't open output file $out_untrimmed: $!\n"; foreach my $seq_name ( sort keys %seq_hash ) { my $seq = $seq_hash{$seq_name}; if ( $seq =~ m/$regex/ ) { my $trimmed_seq = $1; print OUT_TRIMMED ">$seq_name\n$trimmed_seq\n"; $count_trimmed++; } else { print OUT_UNTRIMMED ">$seq_name\n$seq\n"; $count_untrimmed++; } } close OUT_UNTRIMMED; close OUT_TRIMMED; if ($verbose) { print "count_in = $count_in\n"; print "count_trimmed = $count_trimmed ("; printf "%.3f", ( $count_trimmed / $count_in ); print ")\n"; print "count_untrimmed = $count_untrimmed ("; printf "%.3f", ( $count_untrimmed / $count_in ); print ")\n"; } if ($debug) { } exit(0);
tutorials/perl/trim_seq_by_regex.1.pl.txt · Last modified: 2012/06/15 00:34 by chkuo