def parse_item(self, response):
    """Parse a response into a DocumentItem.

    Fills an ``ItemLoader`` with the page URL and hostname, the
    ``description`` meta tag, the title, the href and text of every
    ``<a>`` element, the combined title and body text, the ``<h1>`` text,
    the Content-Type header (when present) and a timestamp, then returns
    the loaded item.

    :param response: the response to extract fields from.
    :returns: the item produced by ``ItemLoader.load_item()``.
    """
    doc_loader = ItemLoader(item=DocumentItem(), response=response)
    doc_loader.add_value('url', response.url)
    # The outer quotes must be double quotes: the XPath predicate itself
    # contains single quotes.
    doc_loader.add_xpath('meta', "//meta[@name='description']/@content")
    doc_loader.add_value('domain', urlparse(response.url).hostname)
    doc_loader.add_xpath('title', '//title/text()')

    hxs = HtmlXPathSelector(response)  # For HTML extractions

    # Collect one {'link', 'link_name'} dict per anchor on the page.
    links = []
    for anchor in hxs.xpath('//a'):
        # An anchor may yield several href/text nodes; join them with a
        # space and drop embedded newlines so each value is a single line.
        href = " ".join(anchor.xpath('@href').extract())
        label = " ".join(anchor.xpath('text()').extract())
        links.append({
            'link': href.replace("\n", ""),
            'link_name': label.replace("\n", "").strip(),
        })
    doc_loader.add_value('links', links)

    # Populate the text fields: the title followed by the body text.
    title = ' '.join(hxs.xpath('//title/text()').extract())
    # NOTE(review): html2string is defined elsewhere; presumably it returns
    # the page's text content as a string -- confirm.
    body_text = self.html2string(response)
    text = title + " " + body_text
    doc_loader.add_value('content', text)
    doc_loader.add_value('raw_text', text)
    doc_loader.add_value('raw_title', title)
    doc_loader.add_value('raw_url', response.url)

    h1_list = hxs.xpath("//h1/text()").extract()
    doc_loader.add_value('h1', " ".join(h1_list))

    # Not every response carries a Content-Type header; skip the field
    # instead of raising KeyError and losing the whole item.
    content_type = response.headers.get('Content-type')
    if content_type is not None:
        doc_loader.add_value('content_type', content_type)

    # Naive local time, formatted without a timezone offset.
    doc_loader.add_value(
        'updated_on',
        datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"))

    return doc_loader.load_item()