# And one more parser

#!/usr/bin/env python

import re
import helpers
import hashlib
from lxml import etree
from lxml import html as lxml_html
from urlparse import urlparse

# Paginated "wanted" listing URL; "{}" is filled with the limitstart offset.
_base_url = "http://efccnigeria.org/efcc/index.php/wanted?limitstart={}"
# Parsed form of the template above; elements 0 and 1 (scheme and host) are
# used to turn the relative hrefs found on listing pages into absolute URLs.
_base_site_name = urlparse(_base_url)

def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so an opening
    ``<`` with no ``>`` on the same line is left untouched.
    """
    return re.sub(r'<.*?>', '', raw_html)

def parse_links(div_blog, iter):
    """Collect absolute detail-page URLs from the rows of a listing page.

    Args:
        div_blog: Sequence of row elements from the listing page. The last
            two entries are dropped when ``iter`` is 0, otherwise only the
            last one is dropped.
        iter: Offset of the listing page being processed.

    Returns:
        A list of URLs, two per remaining row (first column, then second
        column), built from the scheme and host of ``_base_site_name`` plus
        the ``href`` found in each column.
    """
    trailing = 2 if iter == 0 else 1
    rows = div_blog[:-trailing]

    def _column_href(column):
        # The anchor sits five first-child levels below the column element.
        node = column
        for _ in range(5):
            node = node.getchildren()[0]
        return node.attrib.get('href')

    links = []
    for row in rows:
        first_href = _column_href(row.getchildren()[0])
        second_href = _column_href(row.getchildren()[1])
        for href in (first_href, second_href):
            links.append("%s://%s%s" % (_base_site_name[0], _base_site_name[1], href))
    return links

def build_document(content):
    """Build a "person" entity from a detail page's nodes and emit it.

    Args:
        content: Sequence of elements from the person page. The text of the
            first child of ``content[0]`` is used as the name, and the first
            child of ``content[2]`` is serialised as the description.

    The serialised description has its tags removed with ``cleanhtml``,
    ``&#13;`` entities and UTF-8 non-breaking-space bytes dropped, blank lines
    removed, and is cut at the first line equal to ``' <!--'``. When no such
    line exists the whole description is kept instead of raising
    ``ValueError``. The resulting entity is passed to ``helpers.emit``.
    """
    name = content[0].getchildren()[0].text.strip()

    description = etree.tostring(content[2].getchildren()[0], encoding="utf-8")
    description_clean = cleanhtml(description)
    description_clean = re.sub(r'\&\#13\;', '', description_clean)
    description_clean = re.sub('\xc2\xa0', '', description_clean)

    # Keep only non-blank lines, with trailing whitespace removed.
    lines = [ll.rstrip() for ll in description_clean.splitlines() if ll.strip()]

    # Everything from the comment-opening line onwards is discarded; a page
    # without that line keeps all of its lines.
    try:
        cutoff = lines.index(' <!--')
    except ValueError:
        cutoff = len(lines)
    description_clean = ' '.join(lines[:cutoff])

    # Decode once; the same text feeds both the id hash and the field value.
    description_text = description_clean.decode("utf8", errors="ignore")

    entity = {
        "_meta": {
            # The id is a hash of the alphanumeric characters of name + description.
            "id": hashlib.sha224(re.sub("[^a-zA-Z0-9]", "", name + description_text)).hexdigest(),
            "entity_type": "person"
        },
        "name": name,
        "types": ["pep"],
        "fields": [
            {"name": "Description", "value": description_text}
        ]
    }
    helpers.emit(entity)

def main():
    """Walk every listing page and emit one entity per linked person page.

    The first listing page is fetched to read the "pagination-end" link,
    whose ``start=`` value gives the last offset. Listing pages are then
    visited in steps of 20; each link returned by ``parse_links`` is fetched
    and the children of its content node are handed to ``build_document``.
    """
    html_parser = lxml_html.HTMLParser(encoding='utf-8')
    # Absolute location of the person-details container on a detail page.
    person_xpath = "/html/body/div[2]/div[2]/div/div[5]/div/div/div[2]/div[2]/div/div/div[1]"

    first_page = etree.parse(_base_url.format(0), html_parser)
    end_href = first_page.findall("//li[@class='pagination-end']/a")[0].attrib.get('href')
    last_offset = int(re.compile(r"start=([0-9]+)").search(end_href).group(1))

    for offset in range(0, last_offset + 1, 20):
        listing = etree.parse(_base_url.format(offset), html_parser)
        rows = listing.findall("//div[@class='blog']/div")
        for link in parse_links(rows, offset):
            detail = etree.parse(link, html_parser)
            build_document(detail.xpath(person_xpath)[0].getchildren())

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

# No comments.