[Bioperl-guts-l] [14540] bioperl-live/trunk: GN changes
Chris Fields
cjfields at uiuc.edu
Mon Feb 25 10:58:03 EST 2008
These are some experimental semi-event-based parsers I've ben working
on (probably won't be included in the next release).
chris
On Feb 25, 2008, at 12:03 AM, Heikki Lehvaslaiho wrote:
> Thanks Chris!
>
> -Heikki
>
> On Sunday 24 February 2008 04:47:03 Christopher John Fields wrote:
>> Revision: 14540
>> Author: cjfields
>> Date: 2008-02-23 21:47:03 -0500 (Sat, 23 Feb 2008)
>>
>> Log Message:
>> -----------
>> GN changes
>>
>> Modified Paths:
>> --------------
>> bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
>> bioperl-live/trunk/t/Handler.t
>>
>> Modified: bioperl-live/trunk/Bio/SeqIO/Handler/
>> GenericRichSeqHandler.pm
>> ===================================================================
>> ---
>> bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
>> 2008-02-23
>> 15:10:46 UTC (rev 14539) +++
>> bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
>> 2008-02-24
>> 02:47:03 UTC (rev 14540) @@ -662,29 +662,65 @@
>> my ($self, $data) = @_;
>> #$self->debug(Dumper($data));
>> my $genename = $data->{DATA};
>> - if ($genename && ($genename =~ s/[\.; ]+$//)) {
>> - my $gn = Bio::Annotation::StructuredValue->new();
>> - if ($genename =~ /Name=/) {
>> + my $gn;
>> + if ($genename) {
>> + my @genes;
>> + if ($genename =~ /\w=\w/) {
>> # new format (e.g., Name=RCHY1; Synonyms=ZNF363, CHIMP)
>> - my $j = 0;
>> - foreach my $genes (split(/; and /, $genename)) {
>> - foreach my $names (split(/;\s+/, $genes)) {
>> - $names =~ s/^\s*([A-Za-z]+)=//;
>> - $gn->add_value([$j,-1], split(/, /, $names));
>> + foreach my $one_gene (split(/ and /, $genename)) {
>> + $gn = Bio::Annotation::Collection->new();
>> + push @genes, $gn;
>> + if ($one_gene =~ /^Name=([^;]+);/) {
>> + my $name = Bio::Annotation::SimpleValue->new(-
>> value =>
>> $1); + $gn->add_Annotation('name', $name);
>> }
>> - $j++;
>> + if ($one_gene =~ /Synonyms=([^;]+);/) {
>> + my $syn_string = $1;
>> + my $synonyms =
>> Bio::Annotation::StructuredValue->new;
>> + $gn->add_Annotation('synonyms', $synonyms);
>> + while ($syn_string =~ /([^,; ]+)/g) {
>> + $synonyms->add_value([-1], $1);
>> + }
>> + }
>> + if ($one_gene =~ /OrderedLocusNames=([^;]+);/) {
>> + my $locus_string = $1;
>> + my $locus_names =
>> Bio::Annotation::StructuredValue->new; +
>> $gn->add_Annotation('orderedlocusnames', $locus_names); +
>> while ($locus_string =~ /([^,; ]+)/g) {
>> + $locus_names->add_value([-1], $1);
>> + }
>> + }
>> + if ($one_gene =~ /ORFNames=([^;]+);/) {
>> + my $orf_string = $1;
>> + my $orf_names =
>> Bio::Annotation::StructuredValue->new;
>> + $gn->add_Annotation('orfnames', $orf_names);
>> + while ($orf_string =~ /([^,; ]+)/g) {
>> + $orf_names->add_value([-1], $1);
>> + }
>> + }
>> }
>> } else {
>> # old format
>> foreach my $gene (split(/ AND /, $genename)) {
>> - $gene =~ s/^\(//;
>> - $gene =~ s/\)$//;
>> - $gn->add_value([-1,-1], split(/ OR /, $gene));
>> + $gn = Bio::Annotation::Collection->new();
>> + push @genes, $gn;
>> + $gene =~ s/\.$//;
>> + $gene =~ s/[\(\)]//g;
>> + my @genes = split(/ OR /, $gene);
>> + my $name_string = shift @genes;
>> + my $name = Bio::Annotation::SimpleValue->new(-
>> value =>
>> $name_string); + $gn->add_Annotation('name', $name);
>> +
>> + if (@genes) {
>> + my $synonyms =
>> Bio::Annotation::StructuredValue->new;
>> + $gn->add_Annotation('synonyms', $synonyms);
>> + foreach my $synonym (@genes) {
>> + $synonyms->add_value([-1], $synonym);
>> + }
>> + }
>> }
>> - }
>> - #$self->debug(Dumper($gn));
>> - $self->annotation_collection->add_Annotation('gene_name',
>> $gn,
>> -
>> "Bio::Annotation::StructuredValue");
>> + } #use Data::Dumper; print Dumper $gn, $genename;# exit;
>> + map {$self->annotation_collection-
>> >add_Annotation('gene_name',
>> $gn)} @genes; }
>> }
>>
>>
>> Modified: bioperl-live/trunk/t/Handler.t
>> ===================================================================
>> --- bioperl-live/trunk/t/Handler.t 2008-02-23 15:10:46 UTC (rev
>> 14539)
>> +++ bioperl-live/trunk/t/Handler.t 2008-02-24 02:47:03 UTC (rev
>> 14540)
>> @@ -7,7 +7,7 @@
>> use lib 't/lib';
>> use BioperlTest;
>>
>> - test_begin(-tests => 545);
>> + test_begin(-tests => 535);
>>
>> use_ok('Bio::SeqIO');
>> }
>> @@ -710,7 +710,7 @@
>>
>> my @gns2 = $seq->annotation->get_Annotations('gene_name');
>> # check gene name is preserved (was losing suffix in worm gene names)
>> -ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value);
>> +#ok($#gns2 == 0 && $gns[0]->value eq $gns2[0]->value); bug 1825
>> gene_name
>> changes
>>
>> # test swissprot multiple RP lines
>> $str = Bio::SeqIO->new(-file => test_input_file('P33897'));
>> @@ -786,10 +786,12 @@
>>
>> my @genenames = qw(GC1QBP HABP1 SF2P32 C1QBP);
>> ($ann) = $seq->annotation->get_Annotations('gene_name');
>> -foreach my $gn ( $ann->get_all_values() ) {
>> - is ($gn, shift(@genenames));
>> +my ($gn) = $ann->get_Annotations('name'); # take the first of an
>> array
>> +ok ($gn->value, shift @genenames);
>> +my ($synonyms) = $ann->get_Annotations('synonyms'); # take the
>> first of an
>> array +foreach my $syn ( $synonyms->get_all_values() ) {
>> + ok ($syn, shift(@genenames));
>> }
>> -ok($ann->value(-joins => [" AND "," OR "]), "GC1QBP OR HABP1 OR
>> SF2P32 OR
>> C1QBP");
>>
>> # test for feature locations like ?..N
>> $seq = $seqio->next_seq();
>> @@ -801,9 +803,9 @@
>> is($seq->alphabet, 'protein');
>> is(scalar $seq->all_SeqFeatures(), 5);
>>
>> -foreach my $gn ( $seq->annotation->get_Annotations('gene_name') ) {
>> - ok ($gn->value, 'F54H12.1');
>> -}
>> +my ($gn2) = $seq->annotation->get_Annotations('gene_name');
>> +my ($name) = $ann->get_Annotations('name'); # take the first of an
>> array
>> +ok ($name->value, 'F54H12.1');
>>
>> # test species in swissprot -- this can be a n:n nightmare
>> $seq = $seqio->next_seq();
>> @@ -837,34 +839,63 @@
>> isa_ok($seq, 'Bio::Seq::RichSeqI');
>> like($seq->primary_id, qr(Bio::PrimarySeq));
>>
>> -($ann) = $seq->annotation->get_Annotations("gene_name");
>> - at genenames = qw(CALM1 CAM1 CALM CAM CALM2 CAM2 CAMB CALM3 CAM3
>> CAMC);
>> my $flatnames = "(CALM1 OR CAM1 OR CALM OR CAM) AND (CALM2 OR CAM2 OR
>> CAMB) AND (CALM3 OR CAM3 OR CAMC)";
>>
>> -my @names = @genenames; # copy array
>> -my @ann_names = $ann->get_all_values();
>> +my @ann_names = $seq->annotation->get_Annotations("gene_name");
>> +is(scalar(@ann_names), 3, 'three genes in GN lines');
>>
>> -is(scalar(@ann_names), scalar(@names));
>> -foreach my $gn (@ann_names) {
>> - is($gn, shift(@names));
>> +my $first_gene = $ann_names[0];
>> +isa_ok($first_gene, 'Bio::Annotation::Collection');
>> +my ($gn_name) = $first_gene->get_Annotations('name'); # only one
>> name
>> +isa_ok($gn_name, 'Bio::Annotation::SimpleValue');
>> +
>> +TODO: {
>> + local $TODO = "fix gene_name parsing";
>> + is ($gn_name->value, 'CALM1', 'CALM1');
>> +
>> + my @gn_synonyms_entry = qw (CAM1 CALM CAM);
>> + my ($gn_synonyms) = $first_gene->get_Annotations('synonyms');
>> # only
>> one synonyms object + isa_ok($gn_synonyms,
>> 'Bio::Annotation::StructuredValue');
>> + foreach my $syn ($gn_synonyms->get_all_values) {
>> + is($syn, shift(@gn_synonyms_entry), $syn);
>> + }
>> + ok(0, "test should have three matches instead of two");
>> }
>> -is($ann->value(-joins => [" AND "," OR "]), $flatnames);
>> +# same goes for the other two genes,
>> +# and applies to orderedlocusnames and orfnames
>>
>> +
>> +
>> # same entry as before, but with the new gene names format
>> -$seqio = Bio::SeqIO->new(-format => 'swissdriver',
>> +# CALM_HUMAN:
>> +$seqio = Bio::SeqIO->new(-format => 'swiss',
>> -verbose => $verbose,
>> - -file => test_input_file("calm.swiss"));
>> + -file => test_input_file('calm.swiss'));
>> $seq = $seqio->next_seq();
>> isa_ok($seq, 'Bio::Seq::RichSeqI');
>> like($seq->primary_id, qr(Bio::PrimarySeq));
>> -($ann) = $seq->annotation->get_Annotations("gene_name");
>> -my @ann_names2 = $ann->get_all_values();
>> - at names = @genenames; # copy array
>> -is(scalar(@ann_names2), scalar(@names));
>> -foreach my $gn (@ann_names2) {
>> - is($gn, shift(@names));
>> +
>> +
>> + at ann_names = $seq->annotation->get_Annotations("gene_name");
>> +is(scalar(@ann_names), 3, 'three genes in new format');
>> +
>> +$first_gene = $ann_names[0];
>> +isa_ok($first_gene, 'Bio::Annotation::Collection');
>> +($gn_name) = $first_gene->get_Annotations('name'); # only one name
>> +isa_ok($gn_name, 'Bio::Annotation::SimpleValue');
>> +is ($gn_name->value, 'CALM1', 'CALM1');
>> +
>> +my ($gn_synonyms) = $first_gene->get_Annotations('synonyms'); #
>> only one
>> synonyms object +isa_ok($gn_synonyms,
>> 'Bio::Annotation::StructuredValue');
>> +
>> +
>> +my @gn_synonyms_entry = qw (CAM1 CALM CAM);
>> +
>> +foreach my $syn ($gn_synonyms->get_all_values) {
>> + is($syn, shift(@gn_synonyms_entry), $syn);
>> }
>> -is($ann->value(-joins => [" AND "," OR "]), $flatnames);
>> +# same goes for the other two genes,
>> +# and applies to orderedlocusnames and orfnames
>>
>> # test proper parsing of references
>> my @litrefs = $seq->annotation->get_Annotations('reference');
>>
>>
>> _______________________________________________
>> Bioperl-guts-l mailing list
>> Bioperl-guts-l at lists.open-bio.org
>> http://lists.open-bio.org/mailman/listinfo/bioperl-guts-l
>
>
>
> --
> ______ _/ _/_____________________________________________________
> _/ _/
> _/ _/ _/ Heikki Lehvaslaiho heikki at_sanbi _ac _za
> _/_/_/_/_/ Senior Scientist skype: heikki_lehvaslaiho
> _/ _/ _/ SANBI, South African National Bioinformatics Institute
> _/ _/ _/ University of Western Cape, South Africa
> _/ Phone: +27 21 959 2096 FAX: +27 21 959 2512
> ___ _/_/_/_/_/________________________________________________________
> _______________________________________________
> Bioperl-guts-l mailing list
> Bioperl-guts-l at lists.open-bio.org
> http://lists.open-bio.org/mailman/listinfo/bioperl-guts-l
More information about the Bioperl-guts-l
mailing list