#!/usr/bin/env python
"""Scrape the EFCC Nigeria "wanted persons" listing and emit one PEP entity per person.

Walks the paginated listing at ``_base_url``, follows each person's detail
link, strips the HTML description, and hands a normalized entity dict to
``helpers.emit``.
"""
import hashlib
import re
from urllib.parse import urlparse

from lxml import etree
from lxml import html as lxml_html

import helpers

# Paginated listing URL; `limitstart` is the 0-based offset of the first
# item shown on the page (the site pages in steps of 20).
_base_url = "http://efccnigeria.org/efcc/index.php/wanted?limitstart={}"
# Parsed once so parse_links() can rebuild absolute URLs (scheme + netloc).
_base_site_name = urlparse(_base_url)

# Compiled once: matches any single HTML tag (non-greedy, so adjacent tags
# are matched separately).
_TAG_RE = re.compile(r"<.*?>")


def cleanhtml(raw_html):
    """Return *raw_html* with all HTML tags removed (tag text is kept)."""
    return _TAG_RE.sub("", raw_html)


def _first_descendant_href(cell, depth=6):
    """Follow the first child *depth* levels down and return that node's href.

    The listing markup nests each person link six elements deep inside its
    column div; this replaces the original chain of six getchildren()[0]
    calls. Returns None when the final node has no ``href``.
    """
    node = cell
    for _ in range(depth):
        node = node.getchildren()[0]
    return node.attrib.get("href")


def parse_links(div_blog, offset):
    """Extract absolute person-detail URLs from one listing page.

    Args:
        div_blog: the <div class='blog'>/<div> row elements of one page.
        offset: the ``limitstart`` value of the page; the first page
            (offset 0) carries two trailing non-person rows, later pages
            carry one, so those are dropped before extraction.

    Returns:
        A flat list of absolute URLs (left column then right column, per row).
    """
    # Trim the trailing pagination/footer rows that are not person entries.
    rows = div_blog[:-2] if offset == 0 else div_blog[:-1]

    links = []
    for row in rows:
        # Each row has a left and a right column, each holding one person link.
        for column in (row.getchildren()[0], row.getchildren()[1]):
            href = _first_descendant_href(column)
            links.append("%s://%s%s" % (_base_site_name[0],
                                        _base_site_name[1],
                                        href))
    return links


def build_document(content):
    """Build and emit one "pep" entity from a person detail page.

    Args:
        content: child elements of the detail container; content[0] holds
            the name heading, content[2] holds the description markup.
    """
    name = content[0].getchildren()[0].text.strip()

    # encoding="unicode" yields str directly (the original produced UTF-8
    # bytes and then ran str regexes over them, which fails on Python 3).
    description = etree.tostring(content[2].getchildren()[0], encoding="unicode")
    description_clean = cleanhtml(description)
    description_clean = description_clean.replace("&#13;", "")
    # U+00A0 non-breaking spaces survive tag stripping; drop them.
    description_clean = description_clean.replace("\u00a0", "")
    # Strip blank lines and trailing whitespace.
    lines = [ln.rstrip() for ln in description_clean.splitlines() if ln.strip()]

    # Everything from the first HTML comment marker onward is page chrome,
    # not description text. Guard the lookup: the original .index() call
    # crashed when the marker was absent.
    try:
        lines = lines[:lines.index(" <!--")]
    except ValueError:
        pass
    description_clean = " ".join(lines)

    # Stable id: sha224 over the alphanumeric characters of name+description.
    digest_source = re.sub(r"[^a-zA-Z0-9]", "", name + description_clean)
    entity = {
        "_meta": {
            "id": hashlib.sha224(digest_source.encode("utf-8")).hexdigest(),
            "entity_type": "person",
        },
        "name": name,
        "types": ["pep"],
        "fields": [
            {"name": "Description", "value": description_clean},
        ],
    }
    helpers.emit(entity)


def main():
    """Crawl every listing page and emit an entity for each wanted person."""
    parser = lxml_html.HTMLParser(encoding="utf-8")

    # The "end" pagination link on the first page carries the last offset
    # in its ``start=N`` query parameter.
    tree = etree.parse(_base_url.format(0), parser)
    last_page_url = tree.findall("//li[@class='pagination-end']/a")[0].attrib.get("href")
    last_page = int(re.search(r"start=([0-9]+)", last_page_url).group(1))

    # The site pages in steps of 20 items.
    for offset in range(0, last_page + 1, 20):
        tree = etree.parse(_base_url.format(offset), parser)
        div_blog = tree.findall("//div[@class='blog']/div")
        for link in parse_links(div_blog, offset):
            person_tree = etree.parse(link, parser)
            # Fixed path to the detail container on a person page; brittle,
            # but mirrors the site's current markup.
            xpath = ("/html/body/div[2]/div[2]/div/div[5]/div/div"
                     "/div[2]/div[2]/div/div/div[1]")
            person_info = person_tree.xpath(xpath)[0].getchildren()
            build_document(person_info)


if __name__ == "__main__":
    main()
# і ще один парсер  (English: "and one more parser")
# Підписатися на:  (Subscribe to:)
# Дописати коментарі (Atom)  (Post Comments (Atom))
# Немає коментарів:  (No comments:)
# Дописати коментар  (Post a Comment)