My Current Scraper

I'm giving up on writing a unique scraper for every entertainment site. I'm just going to scrape all the links and deal with the mess in PostgreSQL. I removed a bunch of stuff; hopefully it still works.


Great formatting site: https://codeformatter.blogspot.com/

 #!/bin/bash
 #
 # Scrape every link from the page registered for one rss_site row and insert
 # the links that match the row's keyword into rss_entry.
 #
 # Usage: <this script> <rs_id>
 #   rs_id  numeric id of the rss_site row to scrape
 #
 # Needs psql, gawk and /usr/bin/phantomjs, plus $HOME/bin/link_scraper.js.
 rss_host="POTATO"

 # Fill in display-related variables when the environment has not set them.
 # NOTE(review): presumably so PhantomJS can start without a desktop session - confirm.
 if [ -z "$DISPLAY" ]; then
   export DISPLAY=:0
 else
   echo "$DISPLAY"
 fi
 if [ -z "$XDG_RUNTIME_DIR" ]; then
   export XDG_RUNTIME_DIR=/run/user/1000
 fi
 if [ -z "$QT_QPA_PLATFORM" ]; then
   export QT_QPA_PLATFORM=offscreen
 fi

 # stdin attached to a terminal means a person is running the script.
 if [ -t 0 ]; then
   echo "Interactive Mode"
 else
   echo "batch mode"
   # RANDOM is 0..32767, so this yields a delay of 0..109 seconds.
   waittime=$((RANDOM / 300))
   echo "waitting $waittime"
   # The delay itself is currently disabled.
   #sleep $waittime
 fi

 if [ -z "$1" ]; then
   echo "need an rs_id"
 elif ! [[ "$1" =~ ^[0-9]+$ ]]; then
   # The id is pasted directly into SQL below, so refuse anything but digits.
   echo "rs_id must be a number: $1"
 else
   parentid="$1"

   echo "Update Scrape time/fetched"
   psql rss -h "$rss_host" -t -q -c "update rss_site set rs_last_read_dt = now() - (interval '1 minutes ' * date_part('minutes',now())) - interval '1 minutes',rs_fetched =0 where rs_id = $parentid ;"

   echo "working on $parentid"
   # psql -t pads its output with leading spaces; sed strips them.
   url="$(psql rss -h "$rss_host" -t -q -c "select rs_url from rss_site where rs_id = $parentid ;" | sed -e 's/^ *//')"
   echo "url:$url"
   match="$(psql rss -h "$rss_host" -t -q -c "select rs_keyword from rss_site where rs_id = $parentid ;" | sed -e 's/^ *//')"
   echo "match:$match"

   # Earlier lynx-based variant, kept for reference:
   #lynx -dump -listonly $url | grep -i $match | sed -e 's/#.*//' | sed -e 's/.*http/http/' | sed -e "s/^/\',/" | gawk -v parent=$parentid 'BEGIN { FS = "," } ;{print "insert into rss_entry(re_parent_id,re_url,re_title) values(",parent,",",$1.$2.$1,",lower(",$1."RSS",$2.$1,"));"}' | sed -e 's/https/http/' | psql rss -h $rss_host

   if [ -z "$url" ]; then
     echo "no url found for rs_id $parentid"
   else
     echo "phantom scraper"
     # Pipeline stages:
     #   1. print every link on the page, one per line
     #   2. keep the links containing the keyword (case-insensitive); an
     #      empty keyword keeps every link
     #   3. drop the #fragment part
     #   4. prefix each line with "'," so gawk (FS=",") gets the quote
     #      character in $1 and the URL in $2
     #   5. emit one INSERT statement per link
     #   6. rewrite the first "https" on each line to "http"
     #   7. run the statements
     # NOTE(review): a URL is cut at its first comma, and a single quote
     # inside a URL would break the generated SQL.
     /usr/bin/phantomjs "$HOME/bin/link_scraper.js" "$url" \
       | grep -i -- "$match" \
       | sed -e 's/#.*//' \
       | sed -e "s/^/\',/" \
       | gawk -v parent="$parentid" 'BEGIN { FS = "," } ;{print "insert into rss_entry(re_parent_id,re_url,re_title) values(",parent,",",$1.$2.$1,",lower(",$1."RSS",$2.$1,"));"}' \
       | sed -e 's/https/http/' \
       | psql rss -h "$rss_host"
   fi

   psql rss -h "$rss_host" -t -q -c "update rss_site set rs_last_read_dt = now() - (interval '1 minutes ' * date_part('minutes',now())) - interval '1 minutes' where rs_id = $parentid ;"
 fi

 # Clean up any PhantomJS process that is still running.
 if pgrep 'phantomjs' > /dev/null; then
   killall -HUP phantomjs
 fi

 /*  
  * Name: link_scraper.js  
  * Description: Scrapes every link (<a> element) from the URL given on the command line and prints one per line  
  * Date: 03/04/14   
  *  
  *   
  *   
  */  
 // Clear any cookies PhantomJS is holding before a page is loaded.
 phantom.clearCookies();  
 // PhantomJS 'system' module; the script reads its command-line arguments from system.args.
 var system = require('system');  
 /**
  * Collects the target of every <a> element on the loaded page and prints
  * them to stdout, one URL per line.
  *
  * Reads the global `page` (a PhantomJS webpage object), which must already
  * have been created and opened. Anchors without an href attribute are
  * reported as the page's own protocol + hostname.
  *
  * Prints nothing when the page could not be evaluated.
  */
 function riplinks() {
   // The callback is executed inside the web page, so it cannot use any
   // variable from this script and has to be fully self-contained.
   var links = page.evaluate(function () {
     var origin = location.protocol + '//' + location.hostname;
     return [].map.call(document.querySelectorAll('a'), function (link) {
       if (!link.hasAttribute('href')) {
         return origin;
       }
       // The href property is the attribute already resolved by the browser
       // against the document's base URL, so relative paths, ports and
       // non-http schemes all come out correctly.
       var resolved = link.href;
       if (typeof resolved === 'string' && resolved !== '') {
         return resolved;
       }
       // Fallback for anchors whose href property is not a plain string
       // (e.g. <a> inside SVG): report the raw attribute value.
       return link.getAttribute('href');
     });
   });
   // evaluate() yields nothing usable when the page could not be evaluated;
   // calling join on that would throw.
   if (!links) {
     return;
   }
   console.log(links.join('\n'));
 }
 // Entry point. Expects the URL to scrape as the first script argument.
 // system.args[0] is the script name itself, so a URL was supplied only
 // when the array holds at least two entries.
 if (system.args.length < 2) {
   console.log("Usage: <url>");
   phantom.exit(1);
 } else {
   // Declared with var at script level on purpose: riplinks() reads `page`
   // as a global.
   var page = require('webpage').create();
   var url = system.args[1];
   page.viewportSize = {
     width: 3840,
     height: 2160
   };
   page.open(url, function (status) {
     if (status !== 'success') {
       // Report on stderr so the message never mixes with the link list
       // that callers read from stdout.
       system.stderr.writeLine('Unable to load: ' + url);
       phantom.exit(1);
       return;
     }
     // Wait 4 seconds after the load callback before collecting the links.
     window.setTimeout(function () {
       try {
         riplinks();
       } finally {
         // Always terminate, even if link collection throws; otherwise the
         // PhantomJS process would keep running.
         phantom.exit();
       }
     }, 4000);
   });
 }

Comments

Popular posts from this blog

Blocking Bad Light Sources(Sjogren's?)

Hot Spots for Possible B-Cell Disorders(Lupus/Sjogren/Chronic Fatigue)

HttpWebRequest in 3.5