MetagenomicScripts/0000700000076700000240000000000011364301503015024 5ustar sarahpalmerstaffMetagenomicScripts/.DS_Store0000700000076700000240000001400411364301775016524 0ustar sarahpalmerstaffBud1‡Script JavaScriptsfwi0blobicnv JavaScriptsfwswlongÖ  @€ @€ @€ @ E‡DSDB ` @€ @€ @MetagenomicScripts/JavaScripts/0000700000076700000240000000000011364301775017270 5ustar sarahpalmerstaffMetagenomicScripts/JavaScripts/clean_454.java0000700000076700000240000002247211242272525021616 0ustar sarahpalmerstaffimport java.io.*; import java.util.*; public class clean_454{ private static utils_17_01_08 j = new utils_17_01_08(); private String fasta = ""; private static Date D = new Date(); String inputFile; boolean lookForN; public clean_454( String inputFileIn, boolean lookForN_in ){ inputFile = inputFileIn; lookForN = lookForN_in; }//end constructor public void processFasta( String fastaIn, String repeats, String lowComplex ){ String [] fastaArray = fastaIn.split( ":" );//remove keys and line numbers fasta = fastaArray[0]; //from previous methods //j.print( "fasta = "+fasta ); this.check_nucleotides( fasta, repeats, lowComplex ); }//end method public void check_nucleotides( String fasta, String repeats, String lowComplex ){ //--------------------------------------------------- //`Bins' sequences if they contain only one or two //Nucleotides, or if there is an `N'. //--------------------------------------------------- //j.print( "In check_nucleotides" ); int A = 0; int T = 0; int G = 0; int C = 0; int N = 0; int missingNucleotides = 0; String sequence = ""; //j.print( "fasta = "+fasta ); sequence = fasta.substring( 74 ); if( j.matches( "\\/\\/END", sequence ) ){ j.print( sequence ); sequence = sequence.replace( "\\/\\/END", "" ); }//end if for( int i = 0; i < sequence.length(); i ++ ){ //j.print( sequence.substring( i, i+1 ) ); if( sequence.substring( i, i+1 ).equals( "A" ) ){A++;} if( sequence.substring( i, i+1 ).equals( "T" ) ){T++;} if( sequence.substring( i, i+1 ).equals( "G" ) ){G++;} if( sequence.substring( i, i+1 ).equals( "C" ) ){C++;} if( sequence.substring( i, i+1 ).equals( "N" ) ){N++;} //else{N++;}//A non nucleotide character }//end for if( A == 0 ){missingNucleotides ++;}//end if if( T == 0 ){missingNucleotides ++;}//end if if( G == 0 ){missingNucleotides ++;}//end if if( C == 0 ){missingNucleotides ++;}//end if if( N > 0 && lookForN == true ){//check to see that the -n for remove fasta += "----N----"; //sequences wuth an N has been input. }//end if else if( missingNucleotides >= 2 ){ fasta += "----2<= nucleotides----"; }//end if //j.print( "fasta = "+fasta ); if( j.matches( "----N----", fasta ) ){ j.append( "JK_"+inputFile+"_has_N.fasta", fasta +"\n" ); }//end if else if( j.matches( "----2<= nucleotides----", fasta ) ){ j.append( "JK_"+inputFile+"_2_nucl.fasta", fasta +"\n" ); }//end else if else{ this.check_for_satellites( fasta, repeats, lowComplex ); }//end if //j.print( "missingNucleotides = "+missingNucleotides ); }//end method public void findLowComplexityRegions( String fasta, String repeats, String lowComplex ){ //j.print( "In findLowComplexityRegions" ); String sequence = ""; String group = ""; sequence = fasta.substring( 74 ); if( j.matches( "//END", sequence ) ){ sequence = sequence.replace( "//END", "" ); }//end if //Note - we concatenate the `groups' to asses the entire //sequence coverage.. if( j.matches( "(A{10,})", sequence ) ){ group += j.matchGroup( "(A{10,})", sequence, 0 ); //j.print( "group = "+group); }//end if if( j.matches( "(T{10,})", sequence ) ){ group += j.matchGroup( "(T{10,})", sequence, 0 ); //j.print( "group = "+group); }//end if if( j.matches( "(G{10,})", sequence ) ){ group += j.matchGroup( "(G{10,})", sequence, 0 ); //j.print( "group = "+group); }//end if if( j.matches( "(C{10,})", sequence ) ){ group += j.matchGroup( "(C{10,})", sequence, 0 ); //j.print( "group = "+group); }//end if if( sequence.length() - group.length() >= 15 ){ this.check_for_satellites( fasta, repeats, lowComplex ); }//end if else{ j.append( "JK_"+inputFile+"_lowComplex.fasta", fasta +"\n" ); //j.append( "lowComplex_seqs.txt", fasta+"\n" ); }//end else }//end method public void check_for_satellites( String fasta, String repeats, String lowComplex ){ //------------------------------------------------- //Removes repeating subsets of sequence, from 2 to //10 residues. Moves through the sequence one residue //at a time, at each position will take a substring //from 1 to 10 residues, and attempts to match it //as a regula expression. //------------------------------------------------- //j.print( "In check_for_satellites" ); Vector list = new Vector(); Vector nonSatellites = new Vector();//for non-satellites String nonSat = ""; String sequence = ""; String group = ""; int groupsLength = 0; sequence = fasta.substring( 74 ); if( j.matches( "//END", sequence ) ){ sequence = sequence.replace( "//END", "" ); }//end if //Now check for satellites //j.print( sequence ); I:for( int i = 0; i < sequence.length(); i ++ ){ String segment = ""; String expression = ""; String miniSeq = ""; K:for( int k = 2; k <= 10; k ++ ){ try{ segment = sequence.substring( i, i+k ); }catch( StringIndexOutOfBoundsException e ){continue;} //if( (segment.length() * Integer.parseInt( repeats ) ) <= sequence.substring( i ).length() ){ //make sure number of desired repeats //doesn't run past end of sequence expression = "(("+segment+"){"+repeats+",}+)"; //j.print( "expression = "+expression+" i = "+i ); if( j.matches( expression, sequence ) ){ group = j.matchGroup( expression, sequence, 0 ); //j.print( "group = "+group ); if( !list.contains( group ) ){ list.add( group ); }//end if i += group.length(); //break K; }//end if //}//end if /*else{//if the number of nucleotides in desired repeat is too much expression = "(("+segment+"){1,}+)"; j.print( "expression = "+expression+" i = "+i ); if( j.matches( expression, sequence ) ){ group = j.matchGroup( expression, sequence, 0 ); j.print( "group = "+group ); if( !list.contains( group ) ){ list.add( group ); }//end if i += group.length(); //break K; }//end if }//end if */ }//end for }//end for*/ for( int z = 0; z < list.size(); z ++ ){ String element = (String)list.get( z ); groupsLength += element.length(); }//end for if( !list.isEmpty() ){ int difference = sequence.length() - groupsLength; //j.print( "sequence.length = "+sequence.length()+" groupsLength = "+groupsLength+" difference = "+ difference ); if( ( sequence.length() - groupsLength ) < 15 ){ fasta += "----SATELLITE----"; }//end if }//end if //j.printVector( list ); if( j.matches( "----SATELLITE----", fasta) ){ j.append( "JK_"+inputFile+"_Satellites.fasta", fasta +"\n" ); }//end if else{ this.bin_by_length( fasta ); }//end else }//end method*/ public void bin_by_length( String fasta ){ if( fasta.substring( 74 ).length() < 15 ){ j.append( "JK_"+inputFile+"_SL_<=15.fasta", fasta +"\n" ); }//end if if( fasta.substring( 74 ).length() >= 15 && fasta.substring( 74 ).length() <= 50){ j.append( "JK_"+inputFile+"_SL_15-50.fasta", fasta +"\n" ); }//end if else if( fasta.substring( 74 ).length() > 50 && fasta.substring( 74 ).length() <= 100){ j.append( "JK_"+inputFile+"_SL_51-100.fasta", fasta +"\n" ); }//end if else if( fasta.substring( 74 ).length() > 100 ){ j.append( "JK_"+inputFile+"_SL_100+.fasta", fasta +"\n" ); }//end if }//end method }//end class MetagenomicScripts/PerlScripts/0000700000076700000240000000000011364301455017304 5ustar sarahpalmerstaffMetagenomicScripts/PerlScripts/BLASTout_compilerv3.pl0000700000076700000240000000365511364276235023424 0ustar sarahpalmerstaff#!/usr/bin/perl -w use strict; # # # BLASTout_compilerv3.pl # version 3 # 28/01/2010 # Sarah Palmer s.a.palmer@warwick.ac.uk # (C) # # Script to compile 3 Blast output files (such as those generated against the nt, est and gss databases) # in a format readable by MEGAN. # # Parameters: # # input_filename_1 # input_filename_2 # input_filename_3 # output_filename # # Changes: # # None. # # # Known problems: # # None. # my $BLASTout_1 = $ARGV[0]; my $BLASTout_2 = $ARGV[1]; my $BLASTout_3 = $ARGV[2]; my $output_filename = $ARGV[3]; open (BLASTOUT1, $BLASTout_1) or die ("Cannot open BLAST output: $!\n"); open (BLASTOUT2, $BLASTout_2) or die ("Cannot open BLAST output: $!\n"); open (BLASTOUT3, $BLASTout_3) or die ("Cannot open BLAST output: $!\n"); open (OUTPUT, ">$output_filename") or die ("Cannot open output file: $!\n"); $/ = "\nEffective search space used:"; my $query; my $query_header; my $nt_hits; my $est_hits; my $gss_hits; my @query_headers = (); while () { #print "$_"; if (m/Query=..(.*?)\nLength=...\n(.*?)\nLambda/s) { $query_header = "Query= ".$1; $nt_hits = $2."\n"; #print "$nt_hits\n"; $est_hits = $query_header; $gss_hits = $query_header; if ($est_hits = ) { #print "I got in here"; if ($est_hits =~ m/Length=...\n(.*?)\nLambda/s) { #print "I think I can $1\n"; $est_hits = $1; } } else { print "Try again"; } if ($gss_hits = ) { #print "I got in here"; if ($gss_hits =~ m/Length=...\n(.*?)\nLambda/s) { #print "I think I can $1\n"; $gss_hits = $1; } } else { print "Try again"; } print OUTPUT "$query_header\n\n $nt_hits\n $est_hits\n $gss_hits\n\n\n\n\n"; # NB. $query_header can just be the one line starting Query= as blast prints it. } } close BLASTOUT1; close BLASTOUT2; close BLASTOUT3; close OUTPUT; exit;MetagenomicScripts/PerlScripts/duplication_eliminatorv2.pl0000700000076700000240000000455211327572533024666 0ustar sarahpalmerstaff#!/usr/bin/perl -w ## # duplication_eliminatorv2.pl # version 2 # 25/01/2010 # Sarah Palmer s.a.palmer@warwick.ac.uk # (c) # This script will find and remove exact duplicate sequences in # fasta files, such as that returned from 454 sequencing. Output # files are written to the same folder as $query with the suffix: # .uniques.out # # Parameters: # # $query -> Fasta file to find duplicates in # # # Changes: # # None. # # # Known problems: # # None. # my $query = $ARGV[0]; # fasta file to find duplicates in open (OUT, ">$query.uniques.out") or die "Could not open output file\n"; #print OUT "sequence_name\tsequence\n"; my %query_fas = (); my $seq_name; my $seq; my $first = 1; open (QUERY, $query) or die "Could not open file $query\n"; while (my $line = ) { chomp $line; if ($line =~/\>/) { my @line = split / /, $line; if ($first == 1) { $first = 0; } else { %query_fas->{$seq} = $seq_name; } $seq = $line[1]; $seq_name = $line[0]; } else { $seq .= $line; } } # Finish off last sequence in file %query_fas->{$seq} = $seq_name; #print "$query_fas{$seq}"; close QUERY; foreach my $seq (keys %query_fas) { print OUT "$query_fas{$seq}\n$seq\n" } close OUT; exit; ## Subroutines sub Load_fasta () { my ($fasta_file, $ref) = @_; my $seq_name; my $seq; my $first = 1; open (FAS, $fasta_file) or die "Could not open $fasta_file.\n"; while (my $line = ) { chomp $line; if ($line =~/\>/) { @line = split / /, $line; if ($first == 1) { $first = 0; } else { $ref->{$seq_name} = $seq; } $seq = ''; $seq_name = substr($line[1],0); } else { $seq .= $line; } } # Finish off last sequence in file $ref->{$seq_name} = $seq; close FAS; } MetagenomicScripts/PerlScripts/Fasta_recovery.pl0000700000076700000240000000415311335235225022622 0ustar sarahpalmerstaff#!/usr/bin/perl -w use strict; # # # Fasta_recovery.pl # version 1 # 11/01/2010 # Jay Moore jay.moore@warwick.ac.uk # (c) # #This script will take a list of names and retrieve the fasta entries for those names. #Intended for use of Megan output lists to recover Fastas of interest. # # # Parameters: # # Fasta file # List of wanted query headers # output file name # # # Changes: # # None. # # # Known problems: # # None. # my $fasta = $ARGV[0]; open (FASTA, $fasta) or die ("Cannot open Fasta file: $!\n"); my $wantedlist = $ARGV[1]; open (WANTED, $wantedlist) or die ("Cannot open wanted file: $!\n"); my $outputname = $ARGV[2]; open (RECOVERED, ">$outputname") or die ("Cannot make parsed output file: $!\n"); my $seq_name; my $seq; my $first = 1; my %seqs = {}; while (my $line = ) { chomp $line; if ($line =~/\>/) { my @line = split / /, $line; if ($first == 1) { $first = 0; } else { $seqs{$seq_name} = $seq; } $seq = ''; $seq_name = substr($line[0],1); } else { $seq .= $line; } } # Finish off last sequence in file $seqs{$seq_name} = $seq; close FASTA; # for each my $seq_name ( sort keys %seqs) { # print $seq_name."\t".$seqs{$seq_name}."\n"; #} # my $wantedname; my @wantedarray = (); my $wantednamecount = 0; while (my $wanted = ) { if ($wanted =~ m/^>(.*)/) { $wantedname = $1; #print "$wantedname\n"; push (@wantedarray, $wantedname); ++$wantednamecount; } } print "The number of wants is: $wantednamecount\n"; #print "@wantedarray\n"; my @matches = (); foreach my $wantedone (@wantedarray) { print "searching for $wantedone...\n"; #if ($wantedone =~ @seqs) if (defined $seqs{$wantedone}) { print "$wantedone is there...\n"; print RECOVERED ">$wantedone\n$seqs{$wantedone}\n"; } } #print "@matches\n"; close (WANTED) or die("Cannot close file: $!"); #close (FASTA) or die("Cannot close file: $!"); close (RECOVERED) or die("Cannot close file: $!"); exit; MetagenomicScripts/PerlScripts/Find_subjects.pl0000700000076700000240000000215011330055357022424 0ustar sarahpalmerstaff#!/usr/bin/perl -w use strict; # # # Find_subjects.pl # version 1 # 11/01/2010 # Jay Moore jay.moore@warwick.ac.uk # (c) # # Script to parse a blast job and return query headers which align to a subject containing a # text search term in the header. # # # Parameters: # # input_filename # search_string # # Changes: # # None. # # # Known problems: # # None. # my $input_filename = $ARGV[0]; my $search_string = $ARGV[1]; open SOURCEFILE, $input_filename; open (HIT, ">$input_filename".".$search_string") or die("Cannot make hit file: $!\n"); my $this_query; my $doing_hits = 0; my $reported_this_one; while () { if ($_ =~ /^Query=/) { chomp $_; $this_query = substr($_, 8); $doing_hits = 1; $reported_this_one = 0; } elsif (($_ =~ /Length/) && ($doing_hits == 1)) { $doing_hits = 0; } elsif (($_ =~/^\>/) && ($_ =~ /$search_string/icg) && ($reported_this_one == 0)) { print HIT ">$this_query\n"; $reported_this_one = 1; } elsif ($doing_hits) { chomp $_; $this_query .= $_; } } close SOURCEFILE; close (HIT) or die("Cannot close hit result file"); exit;MetagenomicScripts/PerlScripts/Reannotatorv3.pl0000700000076700000240000000671511330057233022415 0ustar sarahpalmerstaff#!/usr/bin/perl -w use strict; use warnings; # # # Reannotatorv3.pl # version 3 # 11/01/2010 # Jay Moore jay.moore@warwick.ac.uk # (c) # # Script to search and replace incorrectly annotated Subject headers (eg. Human, not Homo sapiens) # in a BLAST output file. In addition, this script finds the term similar and cuts the header name # at that point (replaces with "cut, date"). New output file has suffix .parsed. # This enables MEGAN to correctly assign these sequences to a species level. # # # Parameters: # # input_filename # search_string # # Restrictions: # Terms to be replaced are not exhaustive. Included in this version as as follows # Human => Homo sapiens # Cotton => Gossypium hirsutum # G.hirsutum => Gossypium hirsutum # G.herbaceum => Gossypium herbaceum # G.raimondii => Gossypium raimondii # G.arboreum => Gossypium arboreum # G.barbadense => Gossypium barbadense # COT => Gossypium hirsutum # M.trunculata => Medicago trunculata # S.lycopersicum => Solanum lycopersicum # # # Changes: # # None. # # # Known problems: # # None. # # # my $BLASTout = $ARGV[0]; open (BLASTOUT, $BLASTout) or die("Cannot open BLAST output: $!\n"); open (PARSED, ">$BLASTout"."parsed") or die("Cannot make parsed output file: $!\n"); my $hitname; my $doing_header = 0; my $header = ''; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime; $year += 1900; $mon++; $mon = "0$mon" if $mon < 10; $mday = "0$mday" if $mday < 10; my $today = "$mday/$mon/$year"; while() { my $line = $_; chomp $line; if ($line =~ /^\>/) { $doing_header = 1; $header = $line; } elsif ($doing_header) { if ($line =~ /^Length=/) { $doing_header = 0; $header =~ s/\| Human/\| Homo sapiens/; $header =~ s/\| Cotton/\| Gossypium hirsutum/; $header =~ s/\| G.hirsutum/\| Gossypium hirsutum/; $header =~ s/\| G.herbaceum/\| Gossypium herbaceum/; $header =~ s/\| G.raimondii/\| Gossypium raimondii/; $header =~ s/\| G.arboreum/\| Gossypium arboreum/; $header =~ s/\| G.barbadense/\| Gossypium barbadense/; $header =~ s/\|COT/\| Gossypium hirsutum /; $header =~ s/\| M.trunculata/\| Medicago trunculata/; $header =~ s/\| S.lycopersicum/\| Solanum lycopersicum/; $header =~ s/- Arabidopsis thaliana/- \[cut, $today\]/; if ($header =~ m/contains similarity to/) { my $trimming = 1; my $i = 0; my $new_header = ''; while ($trimming) { $new_header .= substr($header,$i,1); if ($new_header =~ m/contains similarity /) { $trimming = 0; } $i++; } $new_header .= "[cut, $today]"; $header = $new_header; } if ($header =~ m/similar to/) { my $trimming = 1; my $i = 0; my $new_header = ''; while ($trimming) { #print "$new_header\n"; $new_header .= substr($header,$i,1); if ($new_header =~ m/similar /) { $trimming = 0; } $i++; } $new_header .= "[cut, $today]"; $header = $new_header; } if ($header =~ m/.*\[.*\].*/) { my $trimming = 1; my $i = 0; my $new_header = ''; while ($trimming) { $new_header .= substr($header,$i,1); if ($new_header =~ m/\[/) { $trimming = 0; } $i++; } $new_header .= "cut, $today]"; $header = $new_header; } print PARSED "$header\n"; print PARSED "$line\n"; } else { $header .= $line; } } else { print PARSED "$line\n"; } } close BLASTOUT; close PARSED; exit; MetagenomicScripts/PerlScripts/WittleoutFasta.pl0000700000076700000240000000176511330061365022630 0ustar sarahpalmerstaff#!/usr/bin/perl -w use strict; # # # WittleoutFasta.pl # version 1 # 15/01/2010 # Jay Moore jay.moore@warwick.ac.uk # (c) # # Script to parse a FASTA file and generate an output file only containing records not # in a second FASTA file. # # Parameters: # # input_filename # filter_filename # # # Changes: # # None. # # # Known problems: # # None. # my $input_filename = $ARGV[0]; my $filter_filename = $ARGV[1]; open SOURCEFILE, $input_filename; open FILTERFILE, $filter_filename; my %filter = (); while () { if ($_ =~/^\>/) { $filter{$_} = 1; } } close FILTERFILE; open (HIT, ">$input_filename".".filtered") or die("Cannot make hit file: $!\n"); my $this_one_is_ok = 0; while () { if ($_ =~ /^\>/) { if (defined $filter{$_}) { $this_one_is_ok = 0; } else { $this_one_is_ok = 1; print HIT $_; } } elsif ($this_one_is_ok == 1) { print HIT $_; } } close SOURCEFILE; close (HIT) or die("Cannot close hit result file"); exit;