[Bioperl-guts-l] bioperl-live/Bio/DB/Biblio pdf.pm,1.1,1.2

Allen Day allenday at pub.open-bio.org
Wed Feb 16 16:18:57 EST 2005


Update of /home/repository/bioperl/bioperl-live/Bio/DB/Biblio
In directory pub.open-bio.org:/tmp/cvs-serv12492/Bio/DB/Biblio

Modified Files:
	pdf.pm 
Log Message:
moving from crawl to if/else block for recognizing publisher sites.


Index: pdf.pm
===================================================================
RCS file: /home/repository/bioperl/bioperl-live/Bio/DB/Biblio/pdf.pm,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** pdf.pm	12 Feb 2005 02:44:41 -0000	1.1
--- pdf.pm	16 Feb 2005 21:18:55 -0000	1.2
***************
*** 372,441 ****
  
    #here is the treasure
!   $page =~ m|<!---- Pager -- \(page header\) -- end ------>.+?<a href="(.+?)" onClick="window.open|s;
  
    $self->ua->follow_link( url => $1 );
!   $self->_crawl();
  }
  
! sub _crawl {
!   my( $self ) = @_;
  
!   return undef if $self->depth() == $self->max_depth();
!   return undef if $self->pdf();
  
!   $self->depth( $self->depth + 1 );
  
-   #try to find "PDF" link first
-   my ( $link ) = $self->ua->find_link( text_regex => qr/PDF|View article/ );
-   if ( $link ) {
  
!     next if $visit{ $link->url() };
!     $visit{ $link->url() }++;
  
-     $self->ua->get( $link );
-     print "[" . $self->depth() . "] fetching: " . $link->url() . " " . $self->ua->ct() . "\n" if DEBUG;
  
!     #test for a likely string "href", because some misconfigured webservers will send pdf
!     #as text/html
!     if ( $self->ua->ct() eq 'application/pdf' or
!          ( $self->ua->ct() eq 'text/html' and $self->ua->content !~ /href/is )
!        ) {
!       print "*****FOUND IT (" . $link->url . ") *****\n" if DEBUG;
  
!       $self->pdf( $self->ua->content() );
!     }
!     else {
!       $self->_crawl();
!     }
!   }
!   else {
!     foreach my $link ( $self->ua->find_all_links ) {
  
!       next if $visit{ $link->url() };
!       $visit{ $link->url() }++;
  
!       $self->ua->get( $link );
!       print "[" . $self->depth() . "] fetching: " . $link->url() . " " . $self->ua->ct() . "\n" if DEBUG;
  
!       #test for a likely string "href", because some misconfigured webservers will send pdf
!       #as text/html
!       if ( $self->ua->ct() eq 'application/pdf' or 
!            ( $self->ua->ct() eq 'text/html' and $self->ua->content !~ /href/is )
!          ) {
!         print "*****FOUND IT (" . $link->url . ") *****\n" if DEBUG;
  
!         $self->pdf( $self->ua->content() );
!         last;
!       }
!       else {
!         $self->_crawl();
!       }
!     }
!   }
  
!   $self->depth( $self->depth - 1 );
  
!   return undef;
! }
  
  =head2 pdf()
--- 372,513 ----
  
    #here is the treasure
!   $page =~ m|<!---- Pager -- \(page header\) -- end ------>.+?<SPAN><a href="(.+?)" onClick="window.open|s;
! 
!   if( ! defined($1) ) {
!     return undef;
!   }
  
    $self->ua->follow_link( url => $1 );
! 
!   #uncomment this to do site crawl -- old style
!   #$self->_crawl();
! 
!   my $pdf_url = $self->guess_pdf_url($self->ua->uri);
!   $self->throw( "didn't recognize pattern in '".$self->ua->uri."', please patch module" ) unless $pdf_url;
!   $self->ua->get( $pdf_url );
!   my $content = $self->ua->content();
!   $self->pdf( $content );
  }
  
! sub guess_pdf_url {
!   my($self,$url) = @_;
  
!   #cancer research
!   if( $url =~ m!^(.+?)/cgi/content/full/(\d+)/(\d+)/(\d+)/?$! ) {
!     return qq($1/cgi/reprint/$2/$3/$4.pdf);
!   }
!   #nature
!   #http://www.nature.com/cgi-taf/DynaPage.taf?file=/onc/journal/v18/n27/abs/1202776a.html&dynoptions=doi1108513968
!   #http://www.nature.com/cgi-taf/DynaPage.taf?file=/onc/journal/v18/n27/full/1202776a.html&filetype=pdf
!   elsif( $url =~ m!^(.+?cgi-taf/DynaPage.taf.+?)/journal/(.+?)/abs/(.+?\.html)! ) {
!     return qq($1/journal/$2/full/$3\&filetype=pdf);
!   }
!   #science direct
!   #these pages contain some unpredictable md5 bullshit, so we need to parse the page
!   #http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VPM-480CTTS-5&_coverDate=04%2F30%2F2003&_alid=247076467&_rdoc=1&_fmt=&_orig=search&_qd=1&_cdi=6210&_sort=d&view=c&_acct=C000059605&_version=1&_urlVersion=0&_userid=4423&md5=8054dea49e32e98a6b30b206ea47fbfe
!   #http://www.sciencedirect.com/science?_ob=MImg&_imagekey=B6VPM-480CTTS-5-5&_cdi=6210&_user=4423&_orig=search&_coverDate=04%2F30%2F2003&_qd=1&_sk=999779997&view=c&wchp=dGLbVtz-zSkzV&md5=5b04979d84dab066be5cde52fd2affa7&ie=/sdarticle.pdf
!   elsif( $url =~ m!^(.+?science\?_ob=)ArticleURL(.+?)$! ) {
!     my $link = $self->ua->find_link( text_regex => qr/PDF \(.+?\)/s );
!     return undef unless $link;
!     return $link->url_abs();
!   }
!   #genome biology
!   #http://genomebiology.com/2003/4/7/R43
!   #http://genomebiology.com/content/pdf/gb-2003-4-7-r43.pdf
!   elsif( $url =~ m!^(.+?genomebiology.com)/(\d+)/(\d+)/(\d+)/(.+?)/?$! ) {
!     my $file = lc(sprintf("gb-%d-%d-%d-%s.pdf",$2,$3,$4,$5));
!     return qq($1/content/pdf/$file);
!   }
!   #wiley interscience
!   #http://www3.interscience.wiley.com/cgi-bin/abstract/91013753/ABSTRACT
!   #http://www3.interscience.wiley.com/cgi-bin/fulltext/91013753/PDFSTART
!   #http://download.interscience.wiley.com/cgi-bin/fulltext?ID=96515300&PLACEBO=IE.pdf&mode=pdf
!   elsif( $url =~ m!^(.+?/cgi-bin)/abstract/(\d+?)/ABSTRACT$! ) {
!     $self->ua->get( qq($1/fulltext/$2/PDFSTART) );
!     my $link = $self->ua->find_link( url_regex => qr/fulltext/ );
!     return undef unless $link;
!     return $link->url_abs();
!   }
!   #nar, bioinformatics
!   #http://nar.oupjournals.org/cgi/content/full/32/suppl_1/D258
!   #http://nar.oupjournals.org/cgi/reprint/32/suppl_1/D258.pdf
!   elsif( $url =~ m!^(.+?oupjournals.org/cgi)/reprint/(.+?)$! ) {
!     return qq($1/reprint/$2.pdf);
!   }
!   elsif( $url =~ m!^(.+?oupjournals.org/cgi)/content/full/(.+?)$! ) {
!     return qq($1/reprint/$2.pdf);
!   }
  
!   #plos
!   #http://biology.plosjournals.org/plosonline/?request=get-document&doi=10.1371/journal.pbio.0020009
!   #http://www.plosbiology.org/archive/1545-7885/2/1/pdf/10.1371_journal.pbio.0020009-S.pdf
!   elsif( $url =~ m!^http://[^.]+?\.plos! ) {
!     my $link = $self->ua->find_link( text_regex => qr/^Screen/s );
!     return undef unless $link;
!     return $link->url_abs();
!   }
!   #bmc bioinformatics
!   #http://www.biomedcentral.com/1471-2105/2/7
!   #http://www.biomedcentral.com/content/pdf/1471-2105-2-7.pdf
!   elsif( $url =~ m!^(.+?biomedcentral.+?)/(\d+\-\d+)/(\d+)/(\d+)/?$! ) {
!     my $file = lc(sprintf("%s-%d-%d.pdf",$2,$3,$4));
!     return qq($1/content/pdf/$file);
!   }
  
  
! warn $url;
!   return undef;
! }
  
  
! # sub _crawl {
! #   my( $self ) = @_;
  
! #   return undef if $self->depth() == $self->max_depth();
! #   return undef if $self->pdf();
  
! #   $self->depth( $self->depth + 1 );
  
! #   #try to find "PDF" link first
! #   my ( $link ) = $self->ua->find_link( text_regex => qr/PDF|View article/ );
! #   if ( $link ) {
! #     $self->_fetch_pdf( $link );
! #   }
! #   else {
! #     foreach my $link ( $self->ua->find_all_links ) {
! #       $self->_fetch_pdf( $link );
! #     }
! #   }
  
! #   $self->depth( $self->depth - 1 );
  
! #   return undef;
! # }
  
! # sub _fetch_pdf {
! #   my $self = shift;
! #   my $link = shift;
  
! #   return if $visit{ $link->url_abs };
! #   $visit{ $link->url_abs }++;
! 
! #   $self->ua->get( $link->url_abs );
! #   print "[" . $self->depth() . "] fetching: " . $link->url_abs . " " . $self->ua->ct() . "\n" if DEBUG;
! 
! #   #test for a likely string "href", because some misconfigured webservers will send pdf
! #   #as text/html
! #   if ( $self->ua->ct() eq 'application/pdf' or
! #        ( $self->ua->ct() =~ /text/ and $self->ua->content !~ /href|src/is )
! #      ) {
! #     print "*****FOUND IT (" . $link->url_abs . ") *****\n" if DEBUG;
! 
! #     $self->ua->get( $link->url_abs );
! #     my $content = $self->ua()->content();
! #     $self->pdf( $content );
! #   }
! #   else {
! #     $self->_crawl();
! #   }
! # }
  
  =head2 pdf()



More information about the Bioperl-guts-l mailing list