Колега попросив допомогти знайти помилку в скрипті. Парсилося за допомогою BS (BeautifulSoup), а я його ненавиджу. Переписав на lxml.
#!/usr/bin/env python
#-*- coding: utf-8 -*-
import re
from lxml import etree
from lxml import html as lxml_html
import urllib2
from io import StringIO
# URL template for one page of the project listing; the "{}" placeholder is
# filled with the page number via str.format().
_base_url = "https://www.weblancer.net/projects/?page={}"
def _fetch_tree(url, parser):
    """Download *url* and return its content parsed into an lxml tree."""
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly.
        response.close()
    text = raw.decode("windows-1251", errors="ignore")
    return etree.parse(StringIO(text), parser)


def _first_text(node, path, default=''):
    """Return the stripped first result of XPath *path*, or *default*."""
    found = node.xpath(path)
    if not found:
        return default
    return found[0].strip()


def _page_count(tree):
    """Return the page number found in the last pagination link (1 if none)."""
    links = tree.xpath("//ul[@class='pagination']/li[last()]/a/@href")
    if not links:
        # No pagination block: treat the listing as a single page.
        return 1
    match = re.search(r"(\d+)", links[0])
    if match is None:
        return 1
    return int(match.group(1))


def main():
    """Print title, categories, application and price for every listed row.

    The first listing page is fetched to read the number of the last page
    from the pagination block; then every page from 1 to that number is
    scanned and one line per row is printed to stdout.

    Rows without a title link are skipped; any other missing field is
    printed as an empty string.
    """
    parser = lxml_html.HTMLParser(encoding='windows-1251')
    first_tree = _fetch_tree(_base_url.format(1), parser)
    last_page = _page_count(first_tree)

    # Pages are numbered from 1; starting at 0 requested a bogus extra page.
    for page_number in range(1, last_page + 1):
        if page_number == 1:
            # Already downloaded above; do not fetch it a second time.
            tree = first_tree
        else:
            tree = _fetch_tree(_base_url.format(page_number), parser)

        rows = tree.xpath(
            "//div[@class='container-fluid cols_table show_visited']"
            "/div[@class='row']"
        )
        for row in rows:
            title = _first_text(
                row, "./div[@class='col-sm-7']/a[@class='title']/text()")
            if not title:
                # One row without a title link must not abort the whole run.
                continue
            categories = _first_text(
                row,
                "./div[@class='col-sm-7']/div[@class='text-muted']"
                "/a[@class='text-muted']/text()")
            price = _first_text(
                row, "./div[@class='col-sm-2 amount title']/text()")
            application = _first_text(
                row,
                "./div[@class='col-sm-3 text-right text-nowrap hidden-xs']"
                "/text()")
            # Single parenthesized expression: same output as the Python 2
            # statement form "print a, b, c, d".
            print(" ".join([title, categories, application, price]))
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Немає коментарів:
Дописати коментар