tutorials:perl:trim_lucy.2.pl
- trim_lucy.2.pl
#!/usr/bin/perl -w my $script_name = 'trim_lucy.2.pl'; # Chih-Horng Kuo <chkuo@lifedev.org> # trim lucy output # v2 2011/01/12 # add min_start opt # v1 2010/12/31 use strict; use warnings; use Getopt::Long; my $in_fasta_dir; my $in_qual_dir; my $out_fasta_dir; my $out_qual_dir; my $in_fasta_file_ext; my $out_fasta_file_ext; my $in_qual_file_ext; my $out_qual_file_ext; my $min_start; my $verbose; my $debug; GetOptions( "in_fasta_dir=s" => \$in_fasta_dir, "in_qual_dir=s" => \$in_qual_dir, "out_fasta_dir=s" => \$out_fasta_dir, "out_qual_dir=s" => \$out_qual_dir, "in_fasta_file_ext=s" => \$in_fasta_file_ext, "out_fasta_file_ext=s" => \$out_fasta_file_ext, "in_qual_file_ext=s" => \$in_qual_file_ext, "out_qual_file_ext=s" => \$out_qual_file_ext, "min_start=i" => \$min_start, "verbose=i" => \$verbose, "debug=i" => \$debug, ); system "mkdir -p $out_fasta_dir" unless -e $out_fasta_dir; system "mkdir -p $out_qual_dir" unless -e $out_qual_dir; $in_fasta_file_ext = $in_fasta_file_ext ? $in_fasta_file_ext : "fasta"; $out_fasta_file_ext = $out_fasta_file_ext ? $out_fasta_file_ext : "fasta"; $in_qual_file_ext = $in_qual_file_ext ? $in_qual_file_ext : "qual"; $out_qual_file_ext = $out_qual_file_ext ? $out_qual_file_ext : "qual"; my $count_file = 0; my @file_ids; opendir( DIR, $in_fasta_dir ) or die "can't open $in_fasta_dir: $!\n"; while ( defined( my $in_file = readdir(DIR) ) ) { if ( $in_file =~ /(\S+)\.$in_fasta_file_ext$/ ) { push @file_ids, $1; $count_file++; } } closedir(DIR); foreach my $file_id ( sort @file_ids ) { my $in_fasta_file = $in_fasta_dir . $file_id . '.' . $in_fasta_file_ext; my $in_qual_file = $in_qual_dir . $file_id . '.' . $in_qual_file_ext; my $out_fasta_file = $out_fasta_dir . $file_id . '.' . $out_fasta_file_ext; my $out_qual_file = $out_qual_dir . $file_id . '.' . $out_qual_file_ext; if ( -e $in_qual_file ) { my %seq_hash; # key = seq_name, value = seq my %start_hash; # key = seq_name, value = start my %end_hash; # key = seq_name, value = end my %length_hash; # key = seq_name, value = length my $in_line; # redefine the record separator local $/ = ">"; open IN, "<$in_fasta_file" or die "Can't open input file $in_fasta_file: $!\n"; $in_line = <IN>; # toss the first record, which only consists of ">" while ( my $in_line = <IN> ) { chomp $in_line; my ( $seq_name, $seq ) = split( /\n/, $in_line, 2 ); $seq =~ tr/ \t\n\r//d; # Remove whitespace my @name_fields = split /\s/, $seq_name; my $out_name = $name_fields[0]; my $out_start = $name_fields[4]; my $out_end = $name_fields[5]; $seq_hash{$out_name} = $seq; $start_hash{$out_name} = $out_start; $end_hash{$out_name} = $out_end; $length_hash{$out_name} = length $seq; } close IN; my %qual_HoA; # key = seq_name, value = list of quality values open IN, "<$in_qual_file" or die "Can't open input file $in_qual_file: $!\n"; $in_line = <IN>; # toss the first record, which only consists of ">" while ( my $in_line = <IN> ) { chomp $in_line; my ( $seq_name, $qual ) = split( /\n/, $in_line, 2 ); $qual =~ tr/\n\r//d; # Remove newlines @{ $qual_HoA{$seq_name} } = split /\s+/, $qual; } close IN; my $count_in = scalar keys %seq_hash; my $count_out = 0; open OUT_FASTA, ">$out_fasta_file" or die "Can't open output file $out_fasta_file: $!\n"; open OUT_QUAL, ">$out_qual_file" or die "Can't open output file $out_qual_file: $!\n"; foreach my $seq_name ( sort keys %seq_hash ) { # check length if ( $start_hash{$seq_name} < 1 ) { warn "$file_id: $seq_name start < 1!\n"; next; } if ( $end_hash{$seq_name} > $length_hash{$seq_name} ) { warn "$file_id: $seq_name end > length!\n"; next; } if ( $length_hash{$seq_name} =! scalar @{ $qual_HoA{$seq_name} } ) { warn "$file_id: $seq_name inconsistent length info!\n"; next; } if ( defined $min_start && $start_hash{$seq_name} > $min_start ) { next; } my $out_start = $start_hash{$seq_name} - 1; my $out_end = $end_hash{$seq_name} - 1; my $out_length = $out_end - $out_start + 1; my $out_seq = substr ( $seq_hash{$seq_name}, $out_start, $out_length ); print OUT_FASTA ">$seq_name\n$out_seq\n"; print OUT_QUAL ">$seq_name\n"; foreach my $index ( $out_start..$out_end ) { print OUT_QUAL "${ $qual_HoA{$seq_name} }[$index] "; } print OUT_QUAL "\n"; $count_out++; } close OUT_FASTA; close OUT_QUAL; if ($verbose) { print "$file_id: ", "count_in = $count_in, ", "count_out = $count_out, ", "count_diff = ", ( $count_in - $count_out ), "\n"; } } else { warn "$in_qual_file does not exist!\n"; } } if ($verbose) { print "count_file = $count_file\n"; } if ($debug) { }
tutorials/perl/trim_lucy.2.pl.txt · Last modified: 2012/06/15 00:32 by chkuo