My Current Scraper
I'm giving up on writing a unique scraper for every entertainment site. I'm just going to scrape all the links and deal with the mess in PostgreSQL. I removed a bunch of stuff, so hopefully it still works.
Great formatting site: https://codeformatter.blogspot.com/
Great formatting site: https://codeformatter.blogspot.com/
#!/bin/bash
# Database host that the psql calls further down connect to.
rss_host="POTATO"

# Use display :0 when no X display is configured; otherwise report the one in use.
if [ -n "$DISPLAY" ]; then
  echo "$DISPLAY"
else
  export DISPLAY=:0
fi

# Supply defaults for a headless run, leaving any existing values untouched.
[ -n "$XDG_RUNTIME_DIR" ] || export XDG_RUNTIME_DIR=/run/user/1000
[ -n "$QT_QPA_PLATFORM" ] || export QT_QPA_PLATFORM=offscreen

# stdin is a terminal only when a person launched the script by hand.
if [ ! -t 0 ]; then
  echo "batch mode"
  # RANDOM is 0..32767, so this yields a delay of 0..109.
  waittime=$((RANDOM / 300))
  echo "waitting $waittime"
  # The delay is computed and reported but currently not applied.
  #sleep $waittime
else
  echo "Interactive Mode"
fi
# Scrape one site. The first argument is the rss_site.rs_id of the site to process.
if [ "$1" == "" ]; then
  echo "need an rs_id"
elif [[ ! "$1" =~ ^[0-9]+$ ]]; then
  # The id is spliced straight into SQL below, so anything that is not a plain
  # integer is refused instead of being sent to the database.
  # NOTE(review): assumes rs_id is a non-negative integer key - confirm against the schema.
  echo "rs_id must be a number, got: $1" >&2
else
  echo "Update Scrape time/fetched"
  # Stamp the site as read at the top of the current hour minus one minute and reset the fetched counter.
  psql rss -h "$rss_host" -t -q -c "update rss_site set rs_last_read_dt = now() - (interval '1 minutes ' * date_part('minutes',now())) - interval '1 minutes',rs_fetched =0 where rs_id = $1 ;"
  echo "working on $1"
  # psql -t pads the value with leading blanks; sed strips them.
  url="$(psql rss -h "$rss_host" -t -q -c "select rs_url from rss_site where rs_id = $1 ;" | sed -e 's/^ *//')"
  echo "url:$url"
  match="$(psql rss -h "$rss_host" -t -q -c "select rs_keyword from rss_site where rs_id = $1 ;" | sed -e 's/^ *//')"
  echo "match:$match"
  parentid=$1
  #lynx -dump -listonly $url | grep -i $match | sed -e 's/#.*//' | sed -e 's/.*http/http/' | sed -e "s/^/\',/" | gawk -v parent=$parentid 'BEGIN { FS = "," } ;{print "insert into rss_entry(re_parent_id,re_url,re_title) values(",parent,",",$1.$2.$1,",lower(",$1."RSS",$2.$1,"));"}' | sed -e 's/https/http/' | psql rss -h $rss_host
  if [ -z "$url" ]; then
    # No row (or an empty rs_url) for this id: there is nothing to hand to the scraper.
    echo "no url found for rs_id $1" >&2
  else
    echo "phantom scraper"
    # Pipeline: dump every link on the page, keep the lines containing the site's
    # keyword (case-insensitive; an empty keyword keeps every link), drop #fragments,
    # turn each remaining line into an INSERT statement, rewrite the first "https"
    # on each line to "http", and feed the statements to psql.
    # Variables are quoted so URLs/keywords containing spaces or glob characters
    # are passed through as single arguments.
    # NOTE(review): scraped URLs are still embedded in the generated SQL unescaped;
    # a URL containing a single quote or a comma will break or truncate its INSERT.
    /usr/bin/phantomjs "$HOME/bin/link_scraper.js" "$url" | grep -i -- "$match" | sed -e 's/#.*//' | sed -e "s/^/\',/" | gawk -v parent="$parentid" 'BEGIN { FS = "," } ;{print "insert into rss_entry(re_parent_id,re_url,re_title) values(",parent,",",$1.$2.$1,",lower(",$1."RSS",$2.$1,"));"}' | sed -e 's/https/http/' | psql rss -h "$rss_host"
  fi
  # Re-stamp the read time once the scrape attempt has finished.
  psql rss -h "$rss_host" -t -q -c "update rss_site set rs_last_read_dt = now() - (interval '1 minutes ' * date_part('minutes',now())) - interval '1 minutes' where rs_id = $1 ;"
fi
# Send SIGHUP to any phantomjs processes that are still running once the script is done.
leftover_pids=$(pgrep 'phantomjs')
if [ -n "$leftover_pids" ]; then
  killall -HUP phantomjs
fi
/*
* Name: link_scraper.js
* Description: Scrapes every link (<a> element) from the page at the given URL and prints one URL per line.
* Date: 03/04/14
*
*
*
*/
// PhantomJS 'system' module; used further down to read the command-line arguments.
var system = require('system');
// Start every run without cookies left over from a previous one.
phantom.clearCookies();
/**
 * Collects one URL for every <a> element on the loaded page and prints them
 * to stdout, one per line.
 *
 * Reads the global `page` (the PhantomJS webpage object created by the caller).
 *
 * For each anchor:
 *  - no href attribute   -> the site root (protocol + hostname), as before;
 *  - href starting with "http:" / "https:" (any case) -> the attribute value as written;
 *  - anything else (relative paths, protocol-relative URLs, ...) -> the
 *    browser-resolved absolute URL from `link.href`.
 *
 * Prints nothing when the in-page evaluation yields no result.
 */
function riplinks() {
  var links = page.evaluate(function() {
    var siteRoot = location.protocol + '//' + location.hostname;
    return [].map.call(document.querySelectorAll('a'), function(link) {
      if (!link.hasAttribute('href')) {
        return siteRoot;
      }
      var href = link.getAttribute('href');
      // Only treat the href as absolute when the scheme is at the very start;
      // "http" appearing later (e.g. in a query string) does not count.
      if (/^https?:/i.test(href)) {
        return href;
      }
      // Let the browser resolve relative hrefs against the document URL
      // instead of gluing them onto the hostname by hand.
      return link.href;
    });
  });
  // evaluate() gives back nothing usable if the in-page function failed;
  // bail out rather than throwing on links.join.
  if (!links) {
    return;
  }
  console.log(links.join('\n'));
}
// Entry point. system.args[0] is the script name, so a URL was supplied only
// when there are at least two entries.
if (system.args.length < 2) {
  console.log("Usage: <url>");
  phantom.exit(1);
} else {
  // Declared with var at script level so riplinks() can reach it as a global.
  var page = require('webpage').create();
  var url = system.args[1];
  // Large viewport for rendering the page.
  page.viewportSize = {
    width: 3840,
    height: 2160
  };
  page.open(url, function(status) {
    if (status !== 'success') {
      // The page did not load: there are no links to print. Exit non-zero
      // without writing anything, since stdout is consumed as a list of URLs.
      phantom.exit(1);
      return;
    }
    // Wait four seconds after load before harvesting the links, then quit.
    window.setTimeout(function () {
      riplinks();
      phantom.exit();
    }, 4000);
  });
}
Comments
Post a Comment