Парсер

#!/usr/bin/env python

import re
import helpers
import hashlib
from lxml import etree
from lxml import html as lxml_html

# Page that main() downloads and parses; it is searched for the table whose
# summary attribute is "disqual directors".
_base_url = "http://guernseyregistry.com/article/4036/Disqualified-Directors"

def check_children(element):
    """Return the text of *element*'s first child, or its own text if childless.

    Args:
        element: An ElementTree-style element (lxml or ``xml.etree``) that
            supports ``len()``, integer indexing and a ``.text`` attribute.

    Returns:
        ``element[0].text`` when the element has at least one child,
        otherwise ``element.text``. Either may be ``None`` when the chosen
        node carries no text.
    """
    # len()/indexing replaces the deprecated getchildren(), which the
    # standard-library ElementTree removed in Python 3.9; on lxml elements
    # the result is the same.
    if len(element):
        return element[0].text
    return element.text

def build_document(member):
    """Build a "person" entity from one table row and pass it to ``helpers.emit``.

    Args:
        member: Indexable sequence of the row's cell elements. The first five
            cells are read, in order, as: date of disqualification,
            applicant for disqualification, name of the disqualified
            director, period of disqualification, and end of the period.

    Raises:
        IndexError: If ``member`` has fewer than five cells.
        TypeError: If the name or date cell yields no text (``None``), because
            the two values are concatenated to derive the entity id.
    """
    (
        date_of_disqualification,
        applicant_for_disqualification,
        name_of_disqualified_director,
        period_of_disqualification,
        end_of_disqualification_period,
    ) = [check_children(member[i]) for i in range(5)]

    # Only ASCII letters and digits of name + date feed the id, so spacing and
    # punctuation differences in the cells do not change it.
    id_source = re.sub(
        "[^a-zA-Z0-9]",
        "",
        name_of_disqualified_director + date_of_disqualification,
    )

    entity = {
        "_meta": {
            # hashlib rejects text strings on Python 3. The filtered string is
            # pure ASCII, so encoding cannot fail and the digest is the same
            # one the un-encoded ASCII str produced on Python 2.
            "id": hashlib.sha224(id_source.encode("ascii")).hexdigest(),
            "entity_type": "person"
        },
        "name": name_of_disqualified_director,
        "types": ["pep"],
        "fields": [
            {"name": "Date of disqualification", "value": date_of_disqualification},
            {"name": "Applicant for disqualification", "value": applicant_for_disqualification},
            {"name": "Period of disqualification", "value": period_of_disqualification},
            {"name": "End of disqualification period", "value": end_of_disqualification_period}
        ]
    }
    helpers.emit(entity)

def main():
    """Fetch the disqualified-directors page and emit one entity per table row.

    Parses ``_base_url`` with an HTML parser set to UTF-8, selects the ``tr``
    rows under the ``tbody`` of the table whose ``summary`` attribute is
    ``disqual directors``, skips the first row, and hands the cells of each
    remaining row to ``build_document``.
    """
    parser = lxml_html.HTMLParser(encoding='utf-8')
    tree = etree.parse(_base_url, parser)
    # A leading "/" in an ElementTree.findall() path is rewritten to "./" by
    # the library with a FutureWarning; spelling the relative path explicitly
    # selects the same rows without the warning.
    rows = tree.findall(".//table[@summary='disqual directors']/tbody/tr")
    # The first row is skipped (presumably the header row; confirm against
    # the live page).
    for tr in rows[1:]:
        # list(tr) yields the row's child cells; it replaces the deprecated
        # getchildren() call.
        build_document(list(tr))

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Немає коментарів: