#!/usr/bin/perl -w
# The program fastq_qual_stats.pl takes fastq files and calculates summary statistics for the quality scores.
# For each base position, the mean quality score is calculated and the quality scores are sorted in 6 bins
# from 0-9, 10-19, 20-29, 30-39, 40-49, >=50. The number of bases and % of total bases at that position in each bin is given as output.
# Copyright (C) 2012 Minou Nowrousian
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details (http://www.gnu.org/licenses/).
use strict;
use warnings;
use Getopt::Std;
my %opts = (o=>33,);
getopts('o:', \%opts);
my $summary = qq(
Summary: fastq_qual_stats.pl
Options: -o offset for the quality scores (default: 33,
for certain types of Illumina pipelines
it can be 64, then this option must be set)
Important: Bases and quality sequences must be only ONE line each (i.e. four lines containing the information for each read).
\n);
die($summary) if ($opts{"o"} =~/\D/); ## only numeric
my %positions = (); ## reference to an anonymous array, key is position,
## values are sum of quality scores for all reads, number of counted reads with base at this position,
## number of reads with bases in each of the 6 bins (see above)
my @output_stats; ## array for output stats columns are base position, mean quality and no. of bases and % of bases for each bin,
## lines are base position starting with base 1.
my $outfile;
if ( (@ARGV == 1) && (-e $ARGV[0]) ) {
print "\ndetermine quality scores of reads in file $ARGV[0]:\n quality score offset $opts{'o'}\n\n";
$outfile = $ARGV[0] . ".qualstats";
open INPUTFILE, "< $ARGV[0]" or die "Kann Datei nicht oeffnen: $!";
my $anzahl_reads = 0;
while () { # parses through lines of inputfile
my $line1 = $_;
if ($line1 !~ /^@/) {
print "first line of read does not start with @ at read $anzahl_reads\n";
}
$_ = ;
my $line2 = $_;
$_ = ;
my $line3 = $_;
$_ = ;
my $line4 = $_;
$anzahl_reads += 1;
my @qual = &get_qual($line4);
my $n = 0; ## counter for the number of bases in the read
while (@qual) {
my $qual = shift(@qual);
$n++;
unless (exists $positions{$n}) { ## if this is the first time that this base position is encountered, initialize hash element
$positions{$n} = [0, 0, 0, 0, 0, 0, 0, 0];
}
$positions{$n}->[0] += $qual; ## first element of array is sum of quality scores
$positions{$n}->[1] += 1; ## second element of array is the number of counted bases at this position
if ($qual >= 50) { ## elements 2-7 of array are number of reads for the six bins
$positions{$n}->[7] += 1;
} elsif ($qual >= 40) {
$positions{$n}->[6] += 1;
} elsif ($qual >= 30) {
$positions{$n}->[5] += 1;
} elsif ($qual >= 20) {
$positions{$n}->[4] += 1;
} elsif ($qual >= 10) {
$positions{$n}->[3] += 1;
} else {
$positions{$n}->[2] += 1;
}
}
}
## now prepare output information
push(@output_stats, "base position\ttotal number of bases\tmean quality score\tbin 0-9 bases\tbin 0-9 % of bases\tbin 10-19 bases\tbin 10-19 % of bases\tbin 20-29 bases\tbin 20-29 % of bases\tbin 30-39 bases\tbin 30-39 % of bases\tbin 40-49 bases\tbin 40-49 % of bases\tbin >50 bases\tbin >50 % of bases\n");
foreach my $position (sort { $a <=> $b } keys %positions) {
my ($qual_sum, $total_bases, $bin1, $bin2, $bin3, $bin4, $bin5, $bin6) = @{$positions{$position}};
my $mean = sprintf "%.1f", $qual_sum / $total_bases; ## sum of quality scores / number of bases
my $percent1 = sprintf "%.1f", $bin1 * 100 / $total_bases;
my $percent2 = sprintf "%.1f", $bin2 * 100 / $total_bases;
my $percent3 = sprintf "%.1f", $bin3 * 100 / $total_bases;
my $percent4 = sprintf "%.1f", $bin4 * 100 / $total_bases;
my $percent5 = sprintf "%.1f", $bin5 * 100 / $total_bases;
my $percent6 = sprintf "%.1f", $bin6 * 100 / $total_bases;
push(@output_stats, "$position\t$total_bases\t$mean\t$bin1\t$percent1\t$bin2\t$percent2\t$bin3\t$percent3\t$bin4\t$percent4\t$bin5\t$percent5\t$bin6\t$percent6\n");
}
open OUTPUT, "> $outfile" or die "Kann Datei nicht oeffnen: $!";
print OUTPUT @output_stats;
close OUTPUT;
print "Parsed $anzahl_reads reads in file $ARGV[0]\n";
} else {
die($summary);
}
sub get_qual {
## returns array with numeric quality scores
my $line4 = shift(@_);
$line4 =~ s/\s+$//; # remove all trailing whitespace, end of line, tabs etc.
my @qual;
my $laenge = length($line4);
$laenge = $laenge - 1;
for (my $i=0; $i<=$laenge; $i++) {
my $score = substr($line4, $i, 1);
my $quality = ord($score) - $opts{"o"};
push(@qual, $quality);
}
return @qual;
}