[Bioperl-guts-l] [16520] bioperl-run/trunk: Crossbow file format handling implemented - unsafely.

Dan Kortschak kortsch at dev.open-bio.org
Thu Dec 17 20:55:33 EST 2009


Revision: 16520
Author:   kortsch
Date:     2009-12-17 20:55:33 -0500 (Thu, 17 Dec 2009)
Log Message:
-----------
Crossbow file format handling implemented - unsafely. Optionalised 'out' filespec. Bugfixes and crossbow addition to tests.

Modified Paths:
--------------
    bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm
    bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm
    bioperl-run/trunk/t/Bowtie.t

Added Paths:
-----------
    bioperl-run/trunk/t/data/bowtie/reads/e_coli.cb

Modified: bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm
===================================================================
--- bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm	2009-12-18 01:55:18 UTC (rev 16519)
+++ bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm	2009-12-18 01:55:33 UTC (rev 16520)
@@ -104,7 +104,8 @@
 our @program_commands = qw(
     single
     paired
-); #crossbow format not implemented yet - will attempt when I see what it looks like
+    crossbow
+);
 
 # composite commands: pseudo-commands that run a 
 # sequence of commands
@@ -118,7 +119,8 @@
 # prefixes only for commands that take params/switches...
 our %command_prefixes = (
     'single'     => 'one',
-    'paired'     => 'par'
+    'paired'     => 'par',
+    'crossbow'   => 'crb'
     );
 
 our @program_params = qw(
@@ -308,7 +310,60 @@
     'par|offrate'                  => 'o',
     'par|memory_mapped_io'         => 'mm',
     'par|shared_memory'            => 'shmem',
-    'par|random_seed'              => 'seed'
+    'par|random_seed'              => 'seed',
+
+    'crb|fastq'                    => 'q',
+    'crb|fasta'                    => 'f',
+    'crb|raw'                      => 'r',
+    'crb|inline'                   => 'c',
+    'crb|skip'                     => 's',
+    'crb|upto'                     => 'u',
+    'crb|trim5'                    => '5',
+    'crb|trim3'                    => '3',
+    'crb|phred33'                  => 'phred33-quals',
+    'crb|phred64'                  => 'phred64-quals',
+    'crb|solexa'                   => 'solexa-quals',
+    'crb|solexa1_3'                => 'solexa1.3-quals',
+    'crb|integer_qual'             => 'integer-quals',
+    'crb|max_seed_mismatches'      => 'n',
+    'crb|max_qual_mismatch'        => 'e',
+    'crb|max_quality_sum'          => 'Q',
+    'crb|seed_length'              => 'l',
+    'crb|no_maq_rounding'          => 'nomaqround',
+    'crb|max_mismatches'           => 'v',
+    'crb|min_insert_size'          => 'I',
+    'crb|max_insert_size'          => 'X',
+    'crb|forward_reverse'          => 'fr',
+    'crb|reverse_forward'          => 'rf',
+    'crb|forward_forward'          => 'ff',
+    'crb|no_forward_alignment'     => 'nofw',
+    'crb|no_reverse_alignment'     => 'norc',
+    'crb|max_backtracks'           => 'maxbts',
+    'crb|max_mate_attempts'        => 'pairtries',
+    'crb|try_hard'                 => 'y',
+    'crb|max_search_ram'           => 'chunkmbs',
+    'crb|report_n_alignments'      => 'k',
+    'crb|all'                      => 'a',
+    'crb|supress'                  => 'm',
+    'crb|best'                     => 'best',
+    'crb|strata'                   => 'strata',
+    'crb|fix_strand_bias'          => 'strandfix',
+    'crb|sam_format'               => 'S',
+    'crb|concise'                  => 'concise',
+    'crb|time'                     => 't',
+    'crb|offset_base'              => 'B',
+    'crb|quiet'                    => 'quiet',
+    'crb|ref_map'                  => 'refout',
+    'crb|ref_index'                => 'refidx',
+    'crb|alignmed_file'            => 'al',
+    'crb|unaligned_file'           => 'un',
+    'crb|excess_file'              => 'max',
+    'crb|full_ref_name'            => 'fullref',
+    'crb|threads'                  => 'p',
+    'crb|offrate'                  => 'o',
+    'crb|memory_mapped_io'         => 'mm',
+    'crb|shared_memory'            => 'shmem',
+    'crb|random_seed'              => 'seed'
     );
 
 #
@@ -326,17 +381,20 @@
 #
 
 our %command_files = (
-    'single'     => [qw( ind seq out )],
-    'paired'     => [qw( ind seq seq2 out )]
-    ); #crossbow format not implemented yet - will attempt when I see what it looks like
+    'single'     => [qw( ind seq #out )],
+    'paired'     => [qw( ind seq seq2 #out )],
+    'crossbow'   => [qw( ind seq #out )]
+    );
 
 INIT {
 	# bowtie doesn't really have subprograms so we do it this way
 	foreach (@program_params) {
 		push @program_params, "par\|".$1 if (m/^one\|(.*)/);
+		push @program_params, "crb\|".$1 if (m/^par\|(.*)/);
 	}
 	foreach (@program_switches) {
 		push @program_switches, "par\|".$1 if (m/^one\|(.*)/);
+		push @program_switches, "crb\|".$1 if (m/^par\|(.*)/);
 	}
 	
 #	# add subcommand params and switches for
@@ -362,4 +420,4 @@
 #	# translations for subcmd params/switches not necessary
 }
 
-1;
\ No newline at end of file
+1;

Modified: bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm
===================================================================
--- bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm	2009-12-18 01:55:18 UTC (rev 16519)
+++ bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm	2009-12-18 01:55:33 UTC (rev 16520)
@@ -362,11 +362,11 @@
     my $index=shift @files;
     for ($cmd) {
     	/^p/ && do {
-    		@files = map { ( $_ , shift @files ) } ('-1','-2','');
+    		@files = map { ( $_ , shift @files ) } ('-1','-2',undef);
     		last;
     	};
-    	/^c/ && do { # this will deal with crossbow files when I sort them out
-    		@files = unshift(@files,'--12','');
+    	/^c/ && do {
+    		@files = map { ( $_ , shift @files ) } ('--12',undef,undef);
     		last;
     	}
     }
@@ -456,43 +456,73 @@
 
 sub _prepare_input_sequences {
 
-	my ($self, @args) = @_;
-	my (%args, $read1);
-	if (grep (/^-/, @args)) { # named parms
-		$self->throw("Input args not an even number") unless !(@args % 2);
-		%args = @args;
-		($read1) = @args{qw( -read1 )};
-	} else {
-		($read1) = @args;
-	}
+        my ($self, @args) = @_;
+        my (%args, $read1);
+        if (grep (/^-/, @args)) { # named parms
+                $self->throw("Input args not an even number") unless !(@args % 2);
+                %args = @args;
+                ($read1) = @args{qw( -read1 )};
+        } else {
+                ($read1) = @args;
+        }
 
-	# Could use the AssemblerBase routine for this, except that would not permit
-	# an array of strings - not decided at this stage.
-
-	if (-e $read1) { # we have a file
-		my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$read1);
-		$guesser->guess =~ m/^fast[qa]$/ or $self->throw("Reads file doesn't look like fasta/q at arg 1");
-	} elsif ($read1->isa("Bio::PrimarySeqI")) { # we have a Bio::*Seq*
-		$read1=$read1->seq();
-	} else { # we have something else
-		if (ref($read1) =~ /ARRAY/i) {
-			my @ts;
-			foreach my $seq (@$read1) {
-				if ($seq->isa("Bio::PrimarySeqI")) {
-					$seq=$seq->seq();
-				} else {
-					next if $read1=~m/[[^:alpha:]]/;					
-				}
-				push @ts,$seq;
-			}
-			$read1=join(',',@ts);
-			$self->throw("bowtie requires at least one sequence read") unless (@ts);
-		} else { #must be a string... fail if non-alpha
-			$self->throw("bowtie requires at least one sequence read") if $read1=~m/[[^:alpha:]]/;
-		}
-	}
-	
-	return $read1;
+        # Could use the AssemblerBase routine for this, except that would not permit
+        # an array of strings - not decided at this stage.
+        if ($self->inline) { # expect inline data
+		        if ($read1->isa("Bio::PrimarySeqI")) { # we have a Bio::*Seq*
+		                $read1=$read1->seq();
+		        } else { # we have something else
+		                if (ref($read1) =~ /ARRAY/i) {
+		                        my @ts;
+		                        foreach my $seq (@$read1) {
+		                                if ($seq->isa("Bio::PrimarySeqI")) {
+		                                        $seq=$seq->seq();
+		                                } else {
+		                                        next if $read1=~m/[[^:alpha:]]/;
+		                                }
+		                                push @ts,$seq;
+		                        }
+		                        $read1=join(',',@ts);
+		                        $self->throw("bowtie requires at least one sequence read") unless (@ts);
+		                } else { #must be a string... fail if non-alpha
+		                        $self->throw("bowtie requires at least one valid sequence read") if $read1=~m/[[^:alpha:]]/;
+		                }
+		        }
+		        	    
+        } elsif ( -e $read1 ) { # expect a file - so test whether its appropriate
+              my $cmd = $self->command if $self->can('command');
+              my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$read1);
+              if ($cmd =~ m/^c/) {
+                      $self->carp("Reads file assumed to be crossbow format at arg 1 (no crossbow guesser implementation to confirm)");
+                      # crossbow format - general format 'name\tseq1\tqual1[\tseq2\tqual2]'
+                      # can mix single reads and paired reads
+                      # e.g.
+                      # r0	GAACGATACCCACCCAACTATCGCCATTCCAGCAT	EDCCCBAAAA@@@@?>===<;;9:99987776554
+                      # r1	TATTCTTCCGCATCCTTCATACTCCTGCCGGTCAG	EDCCCBAAAA@@@@?>===<;;9:99987776554	GAATACTGGCGGATTACCGGGGAAGCTGGAGC	EDCCCBAAAA@@@@?>===<;;9:99987776                      
+              } else {
+	              for ($guesser->guess) {
+	              	       m/^fasta$/ && do { 
+	                            ($self->fastq or $self->raw or $cmd =~ m/^c/) and $self->throw("Fasta reads file inappropriate at arg 1");
+	                            $self->fasta(1);
+	                            last;
+	              	       };
+	              	       m/^fastq$/ && do { 
+	                            ($self->fasta or $self->raw or $cmd =~ m/^c/) and $self->throw("Fastq reads file inappropriate at arg 1");
+	                            $self->fastq(1);
+	                            last;
+	              	       };
+	              	       m/^raw$/ && do { 
+	                            ($self->fasta or $self->fastq or $cmd =~ m/^c/) and $self->throw("Raw reads file inappropriate at arg 1");
+	                            $self->raw(1);
+	                            last;
+	              	       }
+	              }
+              }
+        } else {
+        	     $self->throw("bowtie sequence read file does not exist");
+        }
+        
+        return $read1;
 }
 
 =head2 _run()

Modified: bioperl-run/trunk/t/Bowtie.t
===================================================================

@@ Diff output truncated at 10000 characters. @@


More information about the Bioperl-guts-l mailing list