#!/usr/bin/env python
import re
import helpers
import hashlib
from lxml import etree
from lxml import html as lxml_html
# URL of the page that main() downloads and parses for the directors table.
_base_url = "http://guernseyregistry.com/article/4036/Disqualified-Directors"
def check_children(element):
    """Return the text of *element*, preferring its first child when it has one.

    Args:
        element: An ElementTree-style node (supports ``len()``, indexing and
            a ``.text`` attribute), e.g. a table cell.

    Returns:
        The ``.text`` of the first child if the element has any children,
        otherwise the element's own ``.text``. The value is passed through
        unchanged, so it may be ``None`` when the selected node has no text.
    """
    # len() gives the number of child nodes; this replaces the deprecated
    # getchildren() call, which the original also invoked twice per lookup.
    if len(element):
        return element[0].text
    return element.text
def build_document(member):
    """Build a person entity from one table row and pass it to ``helpers.emit``.

    Args:
        member: Indexable sequence of cell elements for a single row. The
            first five items are read, in order, as: date of
            disqualification, applicant for disqualification, name of the
            disqualified director, period of disqualification, and end of
            the disqualification period.

    Raises:
        IndexError: If *member* holds fewer than five items.
    """
    (
        date_of_disqualification,
        applicant_for_disqualification,
        name_of_disqualified_director,
        period_of_disqualification,
        end_of_disqualification_period,
    ) = [check_children(member[i]) for i in range(5)]

    # check_children() can hand back None for an empty cell; treat that as an
    # empty string here so building the id does not fail on None + str.
    id_source = (name_of_disqualified_director or "") + (date_of_disqualification or "")
    # Only ASCII letters and digits survive, so the id does not depend on
    # spacing or punctuation in the scraped text.
    id_key = re.sub(r"[^a-zA-Z0-9]", "", id_source)

    entity = {
        "_meta": {
            # hashlib needs bytes on Python 3. The key is pure ASCII at this
            # point, so encoding it yields the same digest the original
            # produced when hashing the str directly on Python 2.
            "id": hashlib.sha224(id_key.encode("utf-8")).hexdigest(),
            "entity_type": "person"
        },
        "name": name_of_disqualified_director,
        "types": ["pep"],
        "fields": [
            {"name": "Date of disqualification", "value": date_of_disqualification},
            {"name": "Applicant for disqualification", "value": applicant_for_disqualification},
            {"name": "Period of disqualification", "value": period_of_disqualification},
            {"name": "End of disqualification period", "value": end_of_disqualification_period}
        ]
    }
    helpers.emit(entity)
def main():
    """Download the page at ``_base_url`` and emit one entity per table row.

    The page is parsed with lxml's HTML parser (UTF-8), the rows of the table
    whose ``summary`` attribute is ``'disqual directors'`` are collected, and
    every row except the first is handed to ``build_document``.
    """
    parser = lxml_html.HTMLParser(encoding='utf-8')
    tree = etree.parse(_base_url, parser)
    # Calling findall() on an ElementTree with a path that starts with "/"
    # makes lxml emit a FutureWarning and search with a "." prefixed path
    # instead; spelling out the ".//" form keeps the same matches without
    # the warning.
    rows = tree.findall(".//table[@summary='disqual directors']/tbody/tr")
    # The first matched row is skipped -- presumably the header row; confirm
    # against the live page if the table layout changes.
    for tr in rows[1:]:
        # list(tr) is the documented replacement for the deprecated
        # tr.getchildren(); both yield the row's direct child nodes.
        build_document(list(tr))
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Парсер
Підписатися на:
Дописати коментарі (Atom)
Немає коментарів:
Дописати коментар