[Bioperl-guts-l] [16520] bioperl-run/trunk: Crossbow file format handling implemented - unsafely.
Dan Kortschak
kortsch at dev.open-bio.org
Thu Dec 17 20:55:33 EST 2009
Revision: 16520
Author: kortsch
Date: 2009-12-17 20:55:33 -0500 (Thu, 17 Dec 2009)
Log Message:
-----------
Crossbow file format handling implemented - unsafely. Optionalised 'out' filespec. Bugfixes and crossbow addition to tests.
Modified Paths:
--------------
bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm
bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm
bioperl-run/trunk/t/Bowtie.t
Added Paths:
-----------
bioperl-run/trunk/t/data/bowtie/reads/e_coli.cb
Modified: bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm
===================================================================
--- bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm 2009-12-18 01:55:18 UTC (rev 16519)
+++ bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie/Config.pm 2009-12-18 01:55:33 UTC (rev 16520)
@@ -104,7 +104,8 @@
our @program_commands = qw(
single
paired
-); #crossbow format not implemented yet - will attempt when I see what it looks like
+ crossbow
+);
# composite commands: pseudo-commands that run a
# sequence of commands
@@ -118,7 +119,8 @@
# prefixes only for commands that take params/switches...
our %command_prefixes = (
'single' => 'one',
- 'paired' => 'par'
+ 'paired' => 'par',
+ 'crossbow' => 'crb'
);
our @program_params = qw(
@@ -308,7 +310,60 @@
'par|offrate' => 'o',
'par|memory_mapped_io' => 'mm',
'par|shared_memory' => 'shmem',
- 'par|random_seed' => 'seed'
+ 'par|random_seed' => 'seed',
+
+ 'crb|fastq' => 'q',
+ 'crb|fasta' => 'f',
+ 'crb|raw' => 'r',
+ 'crb|inline' => 'c',
+ 'crb|skip' => 's',
+ 'crb|upto' => 'u',
+ 'crb|trim5' => '5',
+ 'crb|trim3' => '3',
+ 'crb|phred33' => 'phred33-quals',
+ 'crb|phred64' => 'phred64-quals',
+ 'crb|solexa' => 'solexa-quals',
+ 'crb|solexa1_3' => 'solexa1.3-quals',
+ 'crb|integer_qual' => 'integer-quals',
+ 'crb|max_seed_mismatches' => 'n',
+ 'crb|max_qual_mismatch' => 'e',
+ 'crb|max_quality_sum' => 'Q',
+ 'crb|seed_length' => 'l',
+ 'crb|no_maq_rounding' => 'nomaqround',
+ 'crb|max_mismatches' => 'v',
+ 'crb|min_insert_size' => 'I',
+ 'crb|max_insert_size' => 'X',
+ 'crb|forward_reverse' => 'fr',
+ 'crb|reverse_forward' => 'rf',
+ 'crb|forward_forward' => 'ff',
+ 'crb|no_forward_alignment' => 'nofw',
+ 'crb|no_reverse_alignment' => 'norc',
+ 'crb|max_backtracks' => 'maxbts',
+ 'crb|max_mate_attempts' => 'pairtries',
+ 'crb|try_hard' => 'y',
+ 'crb|max_search_ram' => 'chunkmbs',
+ 'crb|report_n_alignments' => 'k',
+ 'crb|all' => 'a',
+ 'crb|supress' => 'm',
+ 'crb|best' => 'best',
+ 'crb|strata' => 'strata',
+ 'crb|fix_strand_bias' => 'strandfix',
+ 'crb|sam_format' => 'S',
+ 'crb|concise' => 'concise',
+ 'crb|time' => 't',
+ 'crb|offset_base' => 'B',
+ 'crb|quiet' => 'quiet',
+ 'crb|ref_map' => 'refout',
+ 'crb|ref_index' => 'refidx',
+ 'crb|alignmed_file' => 'al',
+ 'crb|unaligned_file' => 'un',
+ 'crb|excess_file' => 'max',
+ 'crb|full_ref_name' => 'fullref',
+ 'crb|threads' => 'p',
+ 'crb|offrate' => 'o',
+ 'crb|memory_mapped_io' => 'mm',
+ 'crb|shared_memory' => 'shmem',
+ 'crb|random_seed' => 'seed'
);
#
@@ -326,17 +381,20 @@
#
our %command_files = (
- 'single' => [qw( ind seq out )],
- 'paired' => [qw( ind seq seq2 out )]
- ); #crossbow format not implemented yet - will attempt when I see what it looks like
+ 'single' => [qw( ind seq #out )],
+ 'paired' => [qw( ind seq seq2 #out )],
+ 'crossbow' => [qw( ind seq #out )]
+ );
INIT {
# bowtie doesn't really have subprograms so we do it this way
foreach (@program_params) {
push @program_params, "par\|".$1 if (m/^one\|(.*)/);
+ push @program_params, "crb\|".$1 if (m/^par\|(.*)/);
}
foreach (@program_switches) {
push @program_switches, "par\|".$1 if (m/^one\|(.*)/);
+ push @program_switches, "crb\|".$1 if (m/^par\|(.*)/);
}
# # add subcommand params and switches for
@@ -362,4 +420,4 @@
# # translations for subcmd params/switches not necessary
}
-1;
\ No newline at end of file
+1;
Modified: bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm
===================================================================
--- bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm 2009-12-18 01:55:18 UTC (rev 16519)
+++ bioperl-run/trunk/lib/Bio/Tools/Run/Bowtie.pm 2009-12-18 01:55:33 UTC (rev 16520)
@@ -362,11 +362,11 @@
my $index=shift @files;
for ($cmd) {
/^p/ && do {
- @files = map { ( $_ , shift @files ) } ('-1','-2','');
+ @files = map { ( $_ , shift @files ) } ('-1','-2',undef);
last;
};
- /^c/ && do { # this will deal with crossbow files when I sort them out
- @files = unshift(@files,'--12','');
+ /^c/ && do {
+ @files = map { ( $_ , shift @files ) } ('--12',undef,undef);
last;
}
}
@@ -456,43 +456,73 @@
sub _prepare_input_sequences {
- my ($self, @args) = @_;
- my (%args, $read1);
- if (grep (/^-/, @args)) { # named parms
- $self->throw("Input args not an even number") unless !(@args % 2);
- %args = @args;
- ($read1) = @args{qw( -read1 )};
- } else {
- ($read1) = @args;
- }
+ my ($self, @args) = @_;
+ my (%args, $read1);
+ if (grep (/^-/, @args)) { # named parms
+ $self->throw("Input args not an even number") unless !(@args % 2);
+ %args = @args;
+ ($read1) = @args{qw( -read1 )};
+ } else {
+ ($read1) = @args;
+ }
- # Could use the AssemblerBase routine for this, except that would not permit
- # an array of strings - not decided at this stage.
-
- if (-e $read1) { # we have a file
- my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$read1);
- $guesser->guess =~ m/^fast[qa]$/ or $self->throw("Reads file doesn't look like fasta/q at arg 1");
- } elsif ($read1->isa("Bio::PrimarySeqI")) { # we have a Bio::*Seq*
- $read1=$read1->seq();
- } else { # we have something else
- if (ref($read1) =~ /ARRAY/i) {
- my @ts;
- foreach my $seq (@$read1) {
- if ($seq->isa("Bio::PrimarySeqI")) {
- $seq=$seq->seq();
- } else {
- next if $read1=~m/[[^:alpha:]]/;
- }
- push @ts,$seq;
- }
- $read1=join(',',@ts);
- $self->throw("bowtie requires at least one sequence read") unless (@ts);
- } else { #must be a string... fail if non-alpha
- $self->throw("bowtie requires at least one sequence read") if $read1=~m/[[^:alpha:]]/;
- }
- }
-
- return $read1;
+ # Could use the AssemblerBase routine for this, except that would not permit
+ # an array of strings - not decided at this stage.
+ if ($self->inline) { # expect inline data
+ if ($read1->isa("Bio::PrimarySeqI")) { # we have a Bio::*Seq*
+ $read1=$read1->seq();
+ } else { # we have something else
+ if (ref($read1) =~ /ARRAY/i) {
+ my @ts;
+ foreach my $seq (@$read1) {
+ if ($seq->isa("Bio::PrimarySeqI")) {
+ $seq=$seq->seq();
+ } else {
+ next if $read1=~m/[[^:alpha:]]/;
+ }
+ push @ts,$seq;
+ }
+ $read1=join(',',@ts);
+ $self->throw("bowtie requires at least one sequence read") unless (@ts);
+ } else { #must be a string... fail if non-alpha
+ $self->throw("bowtie requires at least one valid sequence read") if $read1=~m/[[^:alpha:]]/;
+ }
+ }
+
+ } elsif ( -e $read1 ) { # expect a file - so test whether its appropriate
+ my $cmd = $self->command if $self->can('command');
+ my $guesser = Bio::Tools::GuessSeqFormat->new(-file=>$read1);
+ if ($cmd =~ m/^c/) {
+ $self->carp("Reads file assumed to be crossbow format at arg 1 (no crossbow guesser implementation to confirm)");
+ # crossbow format - general format 'name\tseq1\tqual1[\tseq2\tqual2]'
+ # can mix single reads and paired reads
+ # e.g.
+ # r0 GAACGATACCCACCCAACTATCGCCATTCCAGCAT EDCCCBAAAA@@@@?>===<;;9:99987776554
+ # r1 TATTCTTCCGCATCCTTCATACTCCTGCCGGTCAG EDCCCBAAAA@@@@?>===<;;9:99987776554 GAATACTGGCGGATTACCGGGGAAGCTGGAGC EDCCCBAAAA@@@@?>===<;;9:99987776
+ } else {
+ for ($guesser->guess) {
+ m/^fasta$/ && do {
+ ($self->fastq or $self->raw or $cmd =~ m/^c/) and $self->throw("Fasta reads file inappropriate at arg 1");
+ $self->fasta(1);
+ last;
+ };
+ m/^fastq$/ && do {
+ ($self->fasta or $self->raw or $cmd =~ m/^c/) and $self->throw("Fastq reads file inappropriate at arg 1");
+ $self->fastq(1);
+ last;
+ };
+ m/^raw$/ && do {
+ ($self->fasta or $self->fastq or $cmd =~ m/^c/) and $self->throw("Raw reads file inappropriate at arg 1");
+ $self->raw(1);
+ last;
+ }
+ }
+ }
+ } else {
+ $self->throw("bowtie sequence read file does not exist");
+ }
+
+ return $read1;
}
=head2 _run()
Modified: bioperl-run/trunk/t/Bowtie.t
===================================================================
@@ Diff output truncated at 10000 characters. @@
More information about the Bioperl-guts-l
mailing list