Skip to content
Snippets Groups Projects
fastaExtractrRNA4dom.pl 9.79 KiB
Newer Older
Anna Buschart's avatar
Anna Buschart committed
#!/usr/bin/perl


# this script takes a fasta file with contigs and the output of Barrnap runs for different kingdoms to produce a fasta file with rRNA sequences and a .tab file in the style of prodigal
# 6 inputs: - the fasta file of the contigs
#           - the .gff files from Barrnap (for eukaryota, bacteria, archaea and mitochondria)
#           - the prefix for the output
# 2 outputs: - a fasta file with the rRNA gene sequences, naming is the contig name appended with _r and a continuous number per contig
#	     - a table with contig name rRNA gene name (see above), the sense, length, start position, end position, completeness and kind (16S, 5S etc)

# Anna Heintz-Buschart, June 2016, based on code written in November 2014



use strict;
use warnings;
use Bio::DB::Fasta;
use List::MoreUtils qw(uniq);
use List::Util qw(min max);
use Getopt::Long;

my ($fastaFile,$eukFile,$arcFile,$bacFile,$mitoFile,$output);
GetOptions('f=s' => \$fastaFile,
	   'e=s' => \$eukFile,
	   'a=s' => \$arcFile,
           'b=s' => \$bacFile,
           'm=s' => \$mitoFile,
           'o=s' => \$output);

my $listFile = $output.".tab";
my $geneFile = $output.".fa";


my %rRNAs = (); #0: start, 1: end, 2: sense, 3:kind; 4:eval

open (IN, $bacFile);
while (my $line = <IN>){
    unless ($line =~ /^#/) { 
        chomp $line;
        my($contig, undef, undef, $start, $end, $eval, $sense, undef, $attribute) = split "\t", $line;
        my $completeness = "";
	if ($attribute =~ /partial/){
            $completeness = " incomplete"
        } else {
            $completeness = " complete"
        }
        my @attributes = split ";", $attribute,2;
        my $kind = $attributes[0];
        substr($kind, 0 , 5) = "";
        $kind .= " bacterial".$completeness;
        push @{$rRNAs{$contig}[0]}, $start;
        push @{$rRNAs{$contig}[1]}, $end;
        push @{$rRNAs{$contig}[2]}, $sense;
        push @{$rRNAs{$contig}[3]}, $kind;
        push @{$rRNAs{$contig}[4]}, $eval;
    }
}
close(IN);

open (IN, $arcFile);
while (my $line = <IN>){
    unless ($line =~ /^#/) { 
        chomp $line;
        my($contig, undef, undef, $start, $end, $eval, $sense, undef, $attribute) = split "\t", $line;
        my $completeness = "";
	if ($attribute =~ /partial/){
            $completeness = " incomplete"
        } else {
            $completeness = " complete"
            }
        my @attributes = split ";", $attribute,2;
        my $kind = $attributes[0];
        substr($kind, 0 , 5) = "";
        $kind .= " archaeal".$completeness;
        push @{$rRNAs{$contig}[0]}, $start;
        push @{$rRNAs{$contig}[1]}, $end;
        push @{$rRNAs{$contig}[2]}, $sense;
        push @{$rRNAs{$contig}[3]}, $kind;
        push @{$rRNAs{$contig}[4]}, $eval;
    }
}
close(IN);

open (IN, $eukFile);
while (my $line = <IN>){
    unless ($line =~ /^#/) { 
        chomp $line;
        my($contig, undef, undef, $start, $end, $eval, $sense, undef, $attribute) = split "\t", $line;
        my $completeness = "";
	if ($attribute =~ /partial/){
            $completeness = " incomplete"
        } else {
            $completeness = " complete"
            }
        my @attributes = split ";", $attribute,2;
        my $kind = $attributes[0];
        substr($kind, 0 , 5) = "";
        $kind .= " eukaryotic".$completeness;
        push @{$rRNAs{$contig}[0]}, $start;
        push @{$rRNAs{$contig}[1]}, $end;
        push @{$rRNAs{$contig}[2]}, $sense;
        push @{$rRNAs{$contig}[3]}, $kind;
        push @{$rRNAs{$contig}[4]}, $eval;
    }
}
close(IN);

open (IN, $mitoFile);
while (my $line = <IN>){
    unless ($line =~ /^#/) { 
        chomp $line;
        my($contig, undef, undef, $start, $end, $eval, $sense, undef, $attribute) = split "\t", $line;
        my $completeness = "";
	if ($attribute =~ /partial/){
            $completeness = " incomplete"
        } else {
            $completeness = " complete"
        }
        my @attributes = split ";", $attribute,2;
        my $kind = $attributes[0];
        substr($kind, 0 , 5) = "";
        $kind .= " mitochondrial".$completeness;
        push @{$rRNAs{$contig}[0]}, $start;
        push @{$rRNAs{$contig}[1]}, $end;
        push @{$rRNAs{$contig}[2]}, $sense;
        push @{$rRNAs{$contig}[3]}, $kind;
        push @{$rRNAs{$contig}[4]}, $eval;
    }
}
close(IN);

my %allContigs = ();
my $db = Bio::DB::Fasta->new( $fastaFile );
open(GEN, ">", $geneFile) or die "cannot open $geneFile \n";
open(TAB, ">", $listFile) or die "cannot open $listFile \n";
print TAB "contig\tgene\tsense\tlength\tstart\tend\teval\tkind\n";
my @ids = keys %rRNAs;
foreach my $contig (@ids){
    my @useStarts = ();
    my @useEnds = ();
    my @useSenses = ();
    my @useKinds = ();
    my @useEvals = ();
    if ($#{$rRNAs{$contig}[0]} > 0) {
        my @starts = @{$rRNAs{$contig}[0]};
        my @ends = @{$rRNAs{$contig}[1]};
        my @senses = @{$rRNAs{$contig}[2]};
        my @kinds = @{$rRNAs{$contig}[3]};
        my @evals = @{$rRNAs{$contig}[4]};
        my @idx = sort { $starts[$a]+0 <=> 0+$starts[$b] } 0 .. $#starts;
        @starts = @starts[@idx];
        @ends = @ends[@idx];
        @senses = @senses[@idx];
        @kinds = @kinds[@idx];
        @evals = @evals[@idx];
        my $currentStart = $starts[0];
        my $currentEnd = $ends[0];
        my @currentAllStarts = ();
        my @currentAllEnds = ();
        my @currentSenses = ();
        my @currentKinds = ();
        my @currentEvals = ();
        for (my $i = 0; $i <= $#starts; $i++) { 
            if ($starts[$i] <= $currentEnd){
                $currentEnd = $ends[$i] if ($ends[$i] >= $currentEnd);
                push @currentAllStarts, $starts[$i];
                push @currentAllEnds, $ends[$i];
                push @currentSenses, $senses[$i];
                push @currentKinds, $kinds[$i];
                push @currentEvals, $evals[$i];
            } else {
                my @ind = ();
                my $testEval = $currentEvals[0];
                for (my $m = 0; $m <= $#currentEvals; $m++){
                    if ($testEval == $currentEvals[$m]) {
                        push @ind, $m;
                    } elsif ($testEval > $currentEvals[$m]) {
                        @ind = ($m);
                        $testEval = $currentEvals[$m];
                    }
                }
                unshift @useEvals, $testEval;
                if ($#ind > 0) {
                    my @uniqueSenses = uniq @currentSenses;
                    my @uniqueKinds = uniq @currentKinds;
                    unshift @useSenses, join('',@uniqueSenses);
                    unshift @useKinds, join(';',@uniqueKinds);
                    unshift @useStarts, min @currentAllStarts[@ind];
                    unshift @useEnds, max @currentAllEnds[@ind];
                } else {
                    unshift @useSenses, $currentSenses[$ind[0]];
                    unshift @useKinds, $currentKinds[$ind[0]];
                    unshift @useStarts, $currentAllStarts[$ind[0]];
                    unshift @useEnds, $currentAllEnds[$ind[0]];
                }
                $currentStart = $starts[$i];
                $currentEnd = $ends[$i];
                @currentAllStarts = ($starts[$i]);
                @currentAllEnds = ($ends[$i]);
                @currentSenses = ($senses[$i]);
                @currentKinds = ($kinds[$i]);
                @currentEvals = ($evals[$i]);
            }
        }
        my @ind = ();
        my $testEval = $currentEvals[0];
        for (my $m = 0; $m <= $#currentEvals; $m++){
            if ($testEval == $currentEvals[$m]) {
                push @ind, $m;
            } elsif ($testEval > $currentEvals[$m]) {
                @ind = ($m);
                $testEval = $currentEvals[$m];
            }
        }
        unshift @useEvals, $testEval;
        if ($#ind > 0) {
            my @uniqueSenses = uniq @currentSenses;
            my @uniqueKinds = uniq @currentKinds;
            unshift @useSenses, join('',@uniqueSenses);
            unshift @useKinds, join(';',@uniqueKinds);
            unshift @useStarts, min @currentAllStarts[@ind];
            unshift @useEnds, max @currentAllEnds[@ind];
        } else {
            unshift @useSenses, $currentSenses[$ind[0]];
            unshift @useKinds, $currentKinds[$ind[0]];
            unshift @useStarts, $currentAllStarts[$ind[0]];
            unshift @useEnds, $currentAllEnds[$ind[0]];
        }
    } else {
        push @useStarts, $rRNAs{$contig}[0][0];
        push @useEnds, $rRNAs{$contig}[1][0];
        push @useSenses, $rRNAs{$contig}[2][0];
        push @useKinds, $rRNAs{$contig}[3][0];
        push @useEvals, $rRNAs{$contig}[4][0];
    }
    for (my $j = 0 ; $j <= $#useStarts ; $j++ ) {
        if ($useSenses[$j] =~ /\+-|-\+/) {
            print STDERR "Ambiguous sense on $contig between $useStarts[$j] and $useEnds[$j]. \n";
            next;
        }
        if (exists $allContigs{$contig}) {
            $allContigs{$contig}++
        } else {
            $allContigs{$contig} = 1
        }
        my $gene = join("", $contig, "_r", $allContigs{$contig});
        my $length = 1 + $useEnds[$j] - $useStarts[$j];
        my $sequence = "";
        if ($useSenses[$j] eq "+") {
            $sequence = $db->seq($contig, $useStarts[$j], $useEnds[$j]);
        } else {
            $sequence = $db->seq($contig, $useEnds[$j], $useStarts[$j]);
        }
        if  (!defined( $sequence )) {
            print STDERR "Sequence $contig not found. \n";
            next;
        } else {
            print GEN ">$gene\n", "$sequence\n";
        }
        print TAB join("\t",$contig,$gene,$useSenses[$j],$length,$useStarts[$j],$useEnds[$j],$useEvals[$j],$useKinds[$j]), "\n";

    }
}
close(GEN);
close(TAB);