[Bioperl-guts-l] [14540] bioperl-live/trunk: GN changes

Chris Fields cjfields at uiuc.edu
Mon Feb 25 10:58:03 EST 2008


These are some experimental semi-event-based parsers I've ben working  
on (probably won't be included in the next release).

chris

On Feb 25, 2008, at 12:03 AM, Heikki Lehvaslaiho wrote:

> Thanks Chris!
>
> 	-Heikki
>
> On Sunday 24 February 2008 04:47:03 Christopher John Fields wrote:
>> Revision: 14540
>> Author:   cjfields
>> Date:     2008-02-23 21:47:03 -0500 (Sat, 23 Feb 2008)
>>
>> Log Message:
>> -----------
>> GN changes
>>
>> Modified Paths:
>> --------------
>>  bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
>>  bioperl-live/trunk/t/Handler.t
>>
>> Modified: bioperl-live/trunk/Bio/SeqIO/Handler/ 
>> GenericRichSeqHandler.pm
>> ===================================================================
>> ---
>> bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm	 
>> 2008-02-23
>> 15:10:46 UTC (rev 14539) +++
>> bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm	 
>> 2008-02-24
>> 02:47:03 UTC (rev 14540) @@ -662,29 +662,65 @@
>>   my ($self, $data) = @_;
>>   #$self->debug(Dumper($data));
>>   my $genename = $data->{DATA};
>> -    if ($genename && ($genename =~ s/[\.; ]+$//)) {
>> -        my $gn = Bio::Annotation::StructuredValue->new();
>> -        if ($genename =~ /Name=/) {
>> +    my $gn;
>> +    if ($genename) {
>> +        my @genes;
>> +        if ($genename =~ /\w=\w/) {
>>           # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
>> -            my $j = 0;
>> -            foreach my $genes (split(/; and /, $genename)) {
>> -                foreach my $names (split(/;\s+/, $genes)) {
>> -                    $names =~ s/^\s*([A-Za-z]+)=//;
>> -                    $gn->add_value([$j,-1], split(/, /, $names));
>> +            foreach my $one_gene (split(/ and /, $genename)) {
>> +                $gn = Bio::Annotation::Collection->new();
>> +                push @genes, $gn;
>> +                if ($one_gene =~ /^Name=([^;]+);/) {
>> +                    my $name = Bio::Annotation::SimpleValue->new(- 
>> value =>
>> $1); +                    $gn->add_Annotation('name', $name);
>>               }
>> -                $j++;
>> +                if ($one_gene =~ /Synonyms=([^;]+);/) {
>> +                    my $syn_string = $1;
>> +                    my $synonyms =  
>> Bio::Annotation::StructuredValue->new;
>> +                    $gn->add_Annotation('synonyms', $synonyms);
>> +                    while ($syn_string =~ /([^,; ]+)/g) {
>> +                        $synonyms->add_value([-1], $1);
>> +                    }
>> +                }
>> +                if ($one_gene =~ /OrderedLocusNames=([^;]+);/) {
>> +                    my $locus_string = $1;
>> +                    my $locus_names =
>> Bio::Annotation::StructuredValue->new; +
>> $gn->add_Annotation('orderedlocusnames', $locus_names); +
>> while ($locus_string =~ /([^,; ]+)/g) {
>> +                        $locus_names->add_value([-1], $1);
>> +                    }
>> +                }
>> +                if ($one_gene =~ /ORFNames=([^;]+);/) {
>> +                    my $orf_string = $1;
>> +                    my $orf_names =  
>> Bio::Annotation::StructuredValue->new;
>> +                    $gn->add_Annotation('orfnames', $orf_names);
>> +                    while ($orf_string =~ /([^,; ]+)/g) {
>> +                        $orf_names->add_value([-1], $1);
>> +                    }
>> +                }
>>           }
>>       } else {
>>           # old format
>>           foreach my $gene (split(/ AND /, $genename)) {
>> -                $gene =~ s/^\(//;
>> -                $gene =~ s/\)$//;
>> -                $gn->add_value([-1,-1], split(/ OR /, $gene));
>> +                $gn = Bio::Annotation::Collection->new();
>> +                push @genes, $gn;
>> +                $gene =~ s/\.$//;
>> +                $gene =~ s/[\(\)]//g;
>> +                my @genes = split(/ OR /, $gene);
>> +                my $name_string = shift @genes;
>> +                my $name = Bio::Annotation::SimpleValue->new(- 
>> value =>
>> $name_string); +                $gn->add_Annotation('name', $name);
>> +
>> +                if (@genes) {
>> +                    my $synonyms =  
>> Bio::Annotation::StructuredValue->new;
>> +                    $gn->add_Annotation('synonyms', $synonyms);
>> +                    foreach my $synonym (@genes) {
>> +                        $synonyms->add_value([-1], $synonym);
>> +                    }
>> +                }
>>           }
>> -        }
>> -        #$self->debug(Dumper($gn));
>> -        $self->annotation_collection->add_Annotation('gene_name',  
>> $gn,
>> -                                     
>> "Bio::Annotation::StructuredValue");
>> +        } #use Data::Dumper; print Dumper $gn, $genename;# exit;
>> +        map {$self->annotation_collection- 
>> >add_Annotation('gene_name',
>> $gn)} @genes; }
>> }
>>
>>
>> Modified: bioperl-live/trunk/t/Handler.t
>> ===================================================================
>> --- bioperl-live/trunk/t/Handler.t	2008-02-23 15:10:46 UTC (rev  
>> 14539)
>> +++ bioperl-live/trunk/t/Handler.t	2008-02-24 02:47:03 UTC (rev  
>> 14540)
>> @@ -7,7 +7,7 @@
>>   use lib 't/lib';
>>   use BioperlTest;
>>
>> -    test_begin(-tests => 545);
>> +    test_begin(-tests => 535);
>>
>>   use_ok('Bio::SeqIO');
>> }
>> @@ -710,7 +710,7 @@
>>
>> my @gns2 = $seq->annotation->get_Annotations('gene_name');
>> # check gene name is preserved (was losing suffix in worm gene names)
>> -ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
>> +#ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);  bug 1825  
>> gene_name
>> changes
>>
>> # test swissprot multiple RP lines
>> $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
>> @@ -786,10 +786,12 @@
>>
>> my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
>> ($ann) = $seq->annotation->get_Annotations('gene_name');
>> -foreach my $gn ( $ann->get_all_values() ) {
>> -    is ($gn, shift(@genenames));
>> +my ($gn) = $ann->get_Annotations('name'); # take the first of an  
>> array
>> +ok ($gn->value, shift @genenames);
>> +my ($synonyms) = $ann->get_Annotations('synonyms'); # take the  
>> first of an
>> array +foreach my $syn ( $synonyms->get_all_values() ) {
>> +    ok ($syn, shift(@genenames));
>> }
>> -ok($ann->value(-joins => [" AND "," OR "]), "GC1QBP OR HABP1 OR  
>> SF2P32 OR
>> C1QBP");
>>
>> # test for feature locations like ?..N
>> $seq = $seqio->next_seq();
>> @@ -801,9 +803,9 @@
>> is($seq->alphabet, 'protein');
>> is(scalar $seq->all_SeqFeatures(), 5);
>>
>> -foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
>> -    ok ($gn->value, 'F54H12.1');
>> -}
>> +my ($gn2) = $seq->annotation->get_Annotations('gene_name');
>> +my ($name) = $ann->get_Annotations('name'); # take the first of an  
>> array
>> +ok ($name->value, 'F54H12.1');
>>
>> # test species in swissprot -- this can be a n:n nightmare
>> $seq = $seqio->next_seq();
>> @@ -837,34 +839,63 @@
>> isa_ok($seq, 'Bio::Seq::RichSeqI');
>> like($seq->primary_id, qr(Bio::PrimarySeq));
>>
>> -($ann) = $seq->annotation->get_Annotations("gene_name");
>> - at genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3  
>> CAMC);
>> my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR
>> CAMB) AND (CALM3 OR CAM3 OR CAMC)";
>>
>> -my @names = @genenames; # copy array
>> -my @ann_names = $ann->get_all_values();
>> +my @ann_names = $seq->annotation->get_Annotations("gene_name");
>> +is(scalar(@ann_names), 3, 'three genes in GN lines');
>>
>> -is(scalar(@ann_names), scalar(@names));
>> -foreach my $gn (@ann_names) {
>> -    is($gn, shift(@names));
>> +my $first_gene = $ann_names[0];
>> +isa_ok($first_gene, 'Bio::Annotation::Collection');
>> +my ($gn_name) = $first_gene->get_Annotations('name'); # only one  
>> name
>> +isa_ok($gn_name, 'Bio::Annotation::SimpleValue');
>> +
>> +TODO: {
>> +    local $TODO = "fix gene_name parsing";
>> +    is ($gn_name->value, 'CALM1', 'CALM1');
>> +
>> +    my @gn_synonyms_entry = qw (CAM1 CALM CAM);
>> +    my ($gn_synonyms) = $first_gene->get_Annotations('synonyms');  
>> # only
>> one synonyms object +    isa_ok($gn_synonyms,
>> 'Bio::Annotation::StructuredValue');
>> +    foreach my $syn ($gn_synonyms->get_all_values) {
>> +        is($syn, shift(@gn_synonyms_entry), $syn);
>> +    }
>> +    ok(0, "test should have three matches instead of two");
>> }
>> -is($ann->value(-joins => [" AND "," OR "]), $flatnames);
>> +# same goes for the other two genes,
>> +# and applies to orderedlocusnames and orfnames
>>
>> +
>> +
>> # same entry as before, but with the new gene names format
>> -$seqio = Bio::SeqIO->new(-format => 'swissdriver',
>> +# CALM_HUMAN:
>> +$seqio = Bio::SeqIO->new(-format => 'swiss',
>>                                -verbose => $verbose,
>> -                         -file => test_input_file("calm.swiss"));
>> +                         -file => test_input_file('calm.swiss'));
>> $seq = $seqio->next_seq();
>> isa_ok($seq, 'Bio::Seq::RichSeqI');
>> like($seq->primary_id, qr(Bio::PrimarySeq));
>> -($ann) = $seq->annotation->get_Annotations("gene_name");
>> -my @ann_names2 = $ann->get_all_values();
>> - at names = @genenames; # copy array
>> -is(scalar(@ann_names2), scalar(@names));
>> -foreach my $gn (@ann_names2) {
>> -    is($gn, shift(@names));
>> +
>> +
>> + at ann_names = $seq->annotation->get_Annotations("gene_name");
>> +is(scalar(@ann_names), 3, 'three genes in new format');
>> +
>> +$first_gene = $ann_names[0];
>> +isa_ok($first_gene, 'Bio::Annotation::Collection');
>> +($gn_name) = $first_gene->get_Annotations('name'); # only one name
>> +isa_ok($gn_name, 'Bio::Annotation::SimpleValue');
>> +is ($gn_name->value, 'CALM1', 'CALM1');
>> +
>> +my ($gn_synonyms) = $first_gene->get_Annotations('synonyms'); #  
>> only one
>> synonyms object +isa_ok($gn_synonyms,  
>> 'Bio::Annotation::StructuredValue');
>> +
>> +
>> +my @gn_synonyms_entry = qw (CAM1 CALM CAM);
>> +
>> +foreach my $syn ($gn_synonyms->get_all_values) {
>> +    is($syn, shift(@gn_synonyms_entry), $syn);
>> }
>> -is($ann->value(-joins => [" AND "," OR "]), $flatnames);
>> +# same goes for the other two genes,
>> +# and applies to orderedlocusnames and orfnames
>>
>> # test proper parsing of references
>> my @litrefs = $seq->annotation->get_Annotations('reference');
>>
>>
>> _______________________________________________
>> Bioperl-guts-l mailing list
>> Bioperl-guts-l at lists.open-bio.org
>> http://lists.open-bio.org/mailman/listinfo/bioperl-guts-l
>
>
>
> -- 
> ______ _/      _/_____________________________________________________
>    _/      _/
>   _/  _/  _/  Heikki Lehvaslaiho    heikki at_sanbi _ac _za
>  _/_/_/_/_/  Senior Scientist    skype: heikki_lehvaslaiho
> _/  _/  _/  SANBI, South African National Bioinformatics Institute
> _/  _/  _/  University of Western Cape, South Africa
>   _/      Phone: +27 21 959 2096   FAX: +27 21 959 2512
> ___ _/_/_/_/_/________________________________________________________
> _______________________________________________
> Bioperl-guts-l mailing list
> Bioperl-guts-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-guts-l




More information about the Bioperl-guts-l mailing list