#!/usr/bin/env python
"""Scrape the Guernsey Registry list of disqualified directors.

The page at ``_base_url`` is fetched and parsed as HTML.  Every data row of
the table whose ``summary`` attribute is ``disqual directors`` is converted
into a person entity and passed to ``helpers.emit``.
"""
import hashlib
import re

import helpers
from lxml import etree
from lxml import html as lxml_html

_base_url = "http://guernseyregistry.com/article/4036/Disqualified-Directors"

# Number of leading cells read from each table row.
_COLUMN_COUNT = 5

# Everything except ASCII letters and digits is stripped before hashing, so
# the entity id does not change with whitespace or punctuation differences.
_NON_ALPHANUMERIC = re.compile(r"[^a-zA-Z0-9]")


def check_children(element):
    """Return the text of *element*, or of its first child if it has one.

    Returns ``None`` when the chosen node carries no text.
    """
    # len() on an lxml element counts its children; this replaces the
    # deprecated getchildren() call without changing which node is read.
    if len(element):
        return element[0].text
    return element.text


def build_document(member):
    """Build an entity from one table row and emit it via ``helpers.emit``.

    *member* is the sequence of cells of a row.  The first five cells are
    read, in order: date of disqualification, applicant, director name,
    period, and end of the disqualification period.

    Rows whose name cell has no text are not emitted, because there is
    nothing to identify the person by.  Raises ``IndexError`` if *member*
    has fewer than five cells.
    """
    (date_of_disqualification,
     applicant_for_disqualification,
     name_of_disqualified_director,
     period_of_disqualification,
     end_of_disqualification_period) = [
        check_children(member[i]) for i in range(_COLUMN_COUNT)
    ]

    if not name_of_disqualified_director:
        return

    # An empty date cell yields None; treat it as "" so the id can still be
    # derived from the name instead of failing on str + None.
    id_source = _NON_ALPHANUMERIC.sub(
        "", name_of_disqualified_director + (date_of_disqualification or "")
    )

    entity = {
        "_meta": {
            # hashlib needs bytes on Python 3.  id_source is ASCII-only at
            # this point, so encoding it is also safe on Python 2.
            "id": hashlib.sha224(id_source.encode("utf-8")).hexdigest(),
            "entity_type": "person"
        },
        "name": name_of_disqualified_director,
        "types": ["pep"],
        "fields": [
            {"name": "Date of disqualification",
             "value": date_of_disqualification},
            {"name": "Applicant for disqualification",
             "value": applicant_for_disqualification},
            {"name": "Period of disqualification",
             "value": period_of_disqualification},
            {"name": "End of disqualification period",
             "value": end_of_disqualification_period}
        ]
    }

    helpers.emit(entity)


def main():
    """Fetch the registry page and emit one entity per usable table row."""
    parser = lxml_html.HTMLParser(encoding='utf-8')
    tree = etree.parse(_base_url, parser)
    rows = tree.findall("//table[@summary='disqual directors']/tbody/tr")

    # The first row is skipped (presumably the column headings).
    for tr in rows[1:]:
        cells = list(tr)
        # Rows with too few cells cannot be mapped onto the five expected
        # columns; skip them rather than abort the whole run.
        if len(cells) < _COLUMN_COUNT:
            continue
        build_document(cells)


if __name__ == "__main__":
    main()
Парсер
Підписатися на:
Дописати коментарі (Atom)
Немає коментарів:
Дописати коментар