#!/usr/bin/env python
import re
import helpers
import hashlib
from lxml import etree
from lxml import html as lxml_html
from urlparse import urlparse
# Listing-page URL template; {} is the pagination offset ("limitstart" query arg).
_base_url = "http://efccnigeria.org/efcc/index.php/wanted?limitstart={}"
# Parsed URL tuple: [0] is the scheme, [1] the host — used to absolutize relative links.
_base_site_name = urlparse(_base_url)
def cleanhtml(raw_html):
    """Strip every HTML/XML tag from *raw_html* and return the remaining text."""
    # Non-greedy match so each tag is removed individually instead of one
    # match spanning from the first '<' to the last '>'.
    return re.sub(r'<.*?>', '', raw_html)
def parse_links(div_blog, iter):
    """Extract the per-person profile URLs from one listing page.

    ``div_blog`` is the list of row <div>s found under div.blog; ``iter`` is
    the pagination offset of the page being processed.  Returns absolute URLs
    built from the module-level ``_base_site_name``.
    """
    # The trailing rows of the listing are not person rows: page 0 carries
    # two extra divs to drop, every later page carries one.
    rows = div_blog[:-2] if iter == 0 else div_blog[:-1]

    def deep_href(cell):
        # Descend five levels of first-children, then read the anchor's href.
        node = cell
        for _ in range(5):
            node = node.getchildren()[0]
        return node.attrib.get('href')

    links = []
    for row in rows:
        children = row.getchildren()
        # Each row holds two person cells: left (child 0) then right (child 1),
        # appended in that order.
        for cell in (children[0], children[1]):
            href = deep_href(cell)
            links.append("%s://%s%s" % (_base_site_name[0], _base_site_name[1], href))
    return links
def build_document(content):
    """Build one "person" entity from a profile page's detail cells and emit it.

    *content* is the list of child elements of the profile detail container:
    content[0] holds the name, content[2] the description markup.
    """
    name = content[0].getchildren()[0].text.lstrip().rstrip()
    # Serialize the description subtree to markup, then strip the tags back out.
    description = etree.tostring(content[2].getchildren()[0], encoding="utf-8")
    description_clean = cleanhtml(description)
    # Drop HTML-encoded carriage returns (&#13;) and UTF-8 non-breaking
    # spaces (the 0xC2 0xA0 byte pair).
    description_clean = re.sub('\&\#13\;', '', description_clean)
    description_clean = re.sub('\xc2\xa0', '', description_clean)
    description_clean = "\n".join([ll.rstrip() for ll in description_clean.splitlines() if ll.strip()]) # strip blank lines
    description_clean = description_clean.split("\n")
    # Truncate at the line holding an HTML comment marker left behind by cleanhtml.
    # NOTE(review): list.index raises ValueError if ' <!--' is absent — confirm
    # every profile page contains this comment line.
    index = description_clean.index(' <!--')
    description_clean = description_clean[:index]
    description_clean = ' '.join(description_clean)
    # NOTE(review): calling .decode on this value and hashing a plain str both
    # rely on Python 2 str semantics (consistent with the urlparse import at
    # the top of the file); under Python 3 these calls would fail.
    entity = {
        "_meta": {
            # Stable id: SHA-224 over the alphanumeric characters of name + description.
            "id": hashlib.sha224((re.sub("[^a-zA-Z0-9]", "", name + description_clean.decode("utf8", errors="ignore")))).hexdigest(),
            "entity_type": "person"
        },
        "name": name,
        "types": ["pep"],
        "fields": [
            {"name": "Description", "value": description_clean.decode("utf8", errors="ignore")}
        ]
    }
    helpers.emit(entity)
def main():
    """Crawl every page of the EFCC "wanted" listing and emit one entity per person."""
    parser = lxml_html.HTMLParser(encoding='utf-8')

    # Discover the final pagination offset from the "end" link on page 0.
    first_page = etree.parse(_base_url.format(0), parser)
    end_href = first_page.findall("//li[@class='pagination-end']/a")[0].attrib.get('href')
    last_offset = re.search(r"start=([0-9]+)", end_href).group(1)

    # Listing pages advance 20 entries at a time.
    for offset in range(0, int(last_offset) + 1, 20):
        page = etree.parse(_base_url.format(offset), parser)
        rows = page.findall("//div[@class='blog']/div")
        for link in parse_links(rows, offset):
            profile = etree.parse(link, parser)
            # Hard-coded positional path to the detail container on a profile page.
            xpath = "/html/body/div[2]/div[2]/div/div[5]/div/div/div[2]/div[2]/div/div/div[1]"
            person_info = profile.xpath(xpath)[0].getchildren()
            build_document(person_info)
# Script entry point: crawl only when run directly, not on import.
if __name__ == "__main__":
    main()
# Stray blog-platform boilerplate accidentally pasted into the file
# (Ukrainian: "and one more parser", "Subscribe to: Post Comments (Atom)",
# "No comments:", "Post a Comment") — commented out because as bare text
# it is a Python syntax error.