Kuo Lab Wiki

Usage:

perl ~/plscript/trim_seq_by_regex.1.pl \
  --in_file=INPUT_FILE \
  --out_dir=OUTPUT_DIR \
  --regex="AAACT[CT]AAA[GT]GAATTGACGG(\w+)$" \
  --verbose=1

trim_seq_by_regex.1.pl

#!/usr/bin/perl -w
 
my $script_name = 'trim_seq_by_regex.1.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# trim seqs in a fasta file by provided regex 
# v1 2011/01/13
 
use strict;
use warnings;
 
use Getopt::Long;
use File::Basename;
 
 
my $in_file;
my $out_dir;
my $regex;
my $verbose;
my $debug;
 
GetOptions(
    "in_file=s" => \$in_file,
    "out_dir=s" => \$out_dir,
    "regex=s"   => \$regex,
    "verbose=i" => \$verbose,
    "debug=i"   => \$debug,
);
system "mkdir -p $out_dir" unless -e $out_dir;
 
# read in_file
my %seq_hash;    #key = seq_name, value = seq
{
    open IN, "<$in_file" or die "Can't open input file $in_file: $!\n";
 
    # redefine the record separator
    local $/ = ">";
    my $in_line = <IN>;    # toss the first record, which only consists of ">"
    while ( my $in_line = <IN> ) {
        chomp $in_line;
        next unless $in_line;
        my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
        $seq =~ tr/ \t\n\r//d;    # remove whitespace
        $seq_hash{$seq_name} = $seq;
    }
    close IN;
}
my $count_in = scalar keys %seq_hash;
 
my $out_trimmed = $out_dir . 'trimmed.fasta';
my $out_untrimmed = $out_dir . 'untrimmed.fasta';
 
my $count_trimmed = 0;
my $count_untrimmed = 0;
open OUT_TRIMMED, ">$out_trimmed" 
  or die "Can't open output file $out_trimmed: $!\n";
open OUT_UNTRIMMED, ">$out_untrimmed" 
  or die "Can't open output file $out_untrimmed: $!\n";
foreach my $seq_name ( sort keys %seq_hash ) {
    my $seq = $seq_hash{$seq_name};
    if ( $seq =~ m/$regex/ ) {
        my $trimmed_seq = $1;
        print OUT_TRIMMED ">$seq_name\n$trimmed_seq\n";
        $count_trimmed++;
    }
    else {
        print OUT_UNTRIMMED ">$seq_name\n$seq\n";
        $count_untrimmed++;
    }
}
close OUT_UNTRIMMED;
close OUT_TRIMMED;
 
if ($verbose) {
    print "count_in = $count_in\n";
    print "count_trimmed = $count_trimmed (";
    printf "%.3f", ( $count_trimmed / $count_in );
    print ")\n";
    print "count_untrimmed = $count_untrimmed (";
    printf "%.3f", ( $count_untrimmed / $count_in );
    print ")\n";
}
 
if ($debug) {
}
 
exit(0);