User Tools

Site Tools


tutorials:perl:trim_lucy.2.pl
trim_lucy.2.pl
#!/usr/bin/perl -w
 
my $script_name = 'trim_lucy.2.pl';
 
# Chih-Horng Kuo <chkuo@lifedev.org>
# trim lucy output
# v2 2011/01/12
#   add min_start opt
# v1 2010/12/31
 
use strict;
use warnings;
 
use Getopt::Long;
 
my $in_fasta_dir;
my $in_qual_dir;
my $out_fasta_dir;
my $out_qual_dir;
my $in_fasta_file_ext;
my $out_fasta_file_ext;
my $in_qual_file_ext;
my $out_qual_file_ext;
my $min_start;
my $verbose;
my $debug;
 
GetOptions(
    "in_fasta_dir=s"       => \$in_fasta_dir,
    "in_qual_dir=s"        => \$in_qual_dir,
    "out_fasta_dir=s"      => \$out_fasta_dir,
    "out_qual_dir=s"       => \$out_qual_dir,
    "in_fasta_file_ext=s"  => \$in_fasta_file_ext,
    "out_fasta_file_ext=s" => \$out_fasta_file_ext,
    "in_qual_file_ext=s"   => \$in_qual_file_ext,
    "out_qual_file_ext=s"  => \$out_qual_file_ext,
    "min_start=i"          => \$min_start,
    "verbose=i"            => \$verbose,
    "debug=i"              => \$debug,
);
 
system "mkdir -p $out_fasta_dir" unless -e $out_fasta_dir;
system "mkdir -p $out_qual_dir"  unless -e $out_qual_dir;
$in_fasta_file_ext  = $in_fasta_file_ext  ? $in_fasta_file_ext  : "fasta";
$out_fasta_file_ext = $out_fasta_file_ext ? $out_fasta_file_ext : "fasta";
$in_qual_file_ext   = $in_qual_file_ext   ? $in_qual_file_ext   : "qual";
$out_qual_file_ext  = $out_qual_file_ext  ? $out_qual_file_ext  : "qual";
 
my $count_file = 0;
my @file_ids;
opendir( DIR, $in_fasta_dir ) or die "can't open $in_fasta_dir: $!\n";
while ( defined( my $in_file = readdir(DIR) ) ) {
    if ( $in_file =~ /(\S+)\.$in_fasta_file_ext$/ ) {
        push @file_ids, $1;
        $count_file++;
    }
}
closedir(DIR);
 
foreach my $file_id ( sort @file_ids ) {
    my $in_fasta_file = $in_fasta_dir . $file_id . '.' . $in_fasta_file_ext;
    my $in_qual_file  = $in_qual_dir . $file_id . '.' . $in_qual_file_ext;
    my $out_fasta_file = $out_fasta_dir . $file_id . '.' . $out_fasta_file_ext;
    my $out_qual_file = $out_qual_dir . $file_id . '.' . $out_qual_file_ext;
    if ( -e $in_qual_file ) {
        my %seq_hash; # key = seq_name, value = seq
        my %start_hash; # key = seq_name, value = start
        my %end_hash; # key = seq_name, value = end
        my %length_hash; # key = seq_name, value = length
 
        my $in_line;
        # redefine the record separator
        local $/ = ">";
        open IN, "<$in_fasta_file" or die "Can't open input file $in_fasta_file: $!\n";
        $in_line = <IN>;    # toss the first record, which only consists of ">"
        while ( my $in_line = <IN> ) {
            chomp $in_line;
            my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 );
            $seq =~ tr/ \t\n\r//d;    # Remove whitespace
 
            my @name_fields = split /\s/, $seq_name;
            my $out_name = $name_fields[0];
            my $out_start = $name_fields[4];
            my $out_end = $name_fields[5];
 
            $seq_hash{$out_name} = $seq;
            $start_hash{$out_name} = $out_start;
            $end_hash{$out_name} = $out_end;
            $length_hash{$out_name} = length $seq;
        }
        close IN;
 
        my %qual_HoA; # key = seq_name, value = list of quality values
        open IN, "<$in_qual_file" or die "Can't open input file $in_qual_file: $!\n";
        $in_line = <IN>;    # toss the first record, which only consists of ">"
        while ( my $in_line = <IN> ) {
            chomp $in_line;
            my ( $seq_name, $qual ) = split( /\n/, $in_line, 2 );
            $qual =~ tr/\n\r//d;    # Remove newlines
            @{ $qual_HoA{$seq_name} } = split /\s+/, $qual;
        }
        close IN;
 
        my $count_in = scalar keys %seq_hash;
        my $count_out = 0;
 
        open OUT_FASTA, 
          ">$out_fasta_file" or die "Can't open output file $out_fasta_file: $!\n";
        open OUT_QUAL, 
          ">$out_qual_file" or die "Can't open output file $out_qual_file: $!\n";
        foreach my $seq_name ( sort keys %seq_hash ) {
            # check length
            if ( $start_hash{$seq_name} < 1 ) {
                warn "$file_id: $seq_name start < 1!\n";
                next;
            }
            if ( $end_hash{$seq_name} > $length_hash{$seq_name} ) {
                warn "$file_id: $seq_name end > length!\n";
                next;
            }
            if ( $length_hash{$seq_name} =! scalar @{ $qual_HoA{$seq_name} } ) {
                warn "$file_id: $seq_name inconsistent length info!\n";
                next;
            }
            if ( defined $min_start && $start_hash{$seq_name} > $min_start ) {
                next;
            }
 
            my $out_start = $start_hash{$seq_name} - 1;
            my $out_end = $end_hash{$seq_name} - 1;
            my $out_length = $out_end - $out_start + 1;
            my $out_seq = substr ( $seq_hash{$seq_name}, $out_start, $out_length );
            print OUT_FASTA ">$seq_name\n$out_seq\n";
            print OUT_QUAL ">$seq_name\n";
            foreach my $index ( $out_start..$out_end ) {
                print OUT_QUAL "${ $qual_HoA{$seq_name} }[$index] ";
            }
            print OUT_QUAL "\n";
            $count_out++;
        }
        close OUT_FASTA;
        close OUT_QUAL;
 
        if ($verbose) {
            print "$file_id: ",
              "count_in = $count_in, ",
              "count_out = $count_out, ",
              "count_diff = ",
              ( $count_in - $count_out ),
              "\n";
        }
 
    }
    else {
        warn "$in_qual_file does not exist!\n";
    }
}
 
if ($verbose) {
    print "count_file = $count_file\n";
}
if ($debug) {
}
tutorials/perl/trim_lucy.2.pl.txt · Last modified: 2012/06/15 00:32 by chkuo