Python: Web scraper script on Debian Linux

Install lxml, the basic scraping library:

sudo apt-get install python-lxml

 

Install the Qt library (PyQt4) to get at web content that is rendered by JavaScript:

sudo apt-get install python-qt4

 

Here is a simple example, using Python 2.7, that takes the pollution value of one city:

#!/usr/bin/python
import sys
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from PyQt4.QtWebKit import *
from lxml import html
class Render(QWebPage):
    """Load a URL in a QtWebKit page and keep the resulting main frame.

    Constructing an instance blocks until the page's ``loadFinished`` signal
    fires. Afterwards the main frame is available as ``self.frame`` (its
    ``toHtml()`` gives the rendered markup) and the flag delivered with the
    signal is available as ``self.load_ok``.
    """

    def __init__(self, url):
        """Create the Qt application, load *url* and wait for it to finish.

        url -- address of the page to load, as a string.
        """
        # The QApplication is created before the QWebPage base class is
        # initialised; Qt needs an application object before page objects.
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Run the Qt event loop; _loadFinished() quits it once loading ends.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: store the frame and stop the event loop.

        result -- flag passed by the signal (per the Qt documentation, true
                  when the load succeeded); kept in ``self.load_ok`` so
                  callers can detect a failed load.
        """
        self.load_ok = result
        self.frame = self.mainFrame()
        self.app.quit()
# Page whose rendered content is scraped.
url = 'http://aqicn.org/city/warsaw/m'

# Render the page (blocks until loading has finished) and grab its HTML.
page_html = Render(url).frame.toHtml()

# Convert the Qt string to a plain ASCII Python string for lxml.
ascii_html = str(page_html.toAscii())
document = html.fromstring(ascii_html)

# Text content of the element with this id.
# NOTE(review): the id looks auto-generated and may change - confirm it
# against the current page source before relying on it.
waw = document.xpath('//*[@id="xatzcaqv"]/text()')

# Parenthesised single-argument form prints the same under Python 2.7.
print(waw)

 
After rendering, it should give you a result like:
['71']

This is just one simple way to build your own statistics from scraped data, for use in MySQL, LibreOffice Calc charts, and so on.