def parse_news(self, response):
    """Build a ``News`` item from a NewsML-shaped JSON response body.

    Fields are extracted in a fixed order (url, title, raw_content,
    author_name, published_at). As soon as a required value is missing,
    empty, or malformed, extraction stops and the item loaded so far is
    returned, so callers may receive a partially populated item.

    Args:
        response: Response object whose ``body`` holds the JSON document.

    Returns:
        The result of ``loader.load_item()`` with whichever fields were
        successfully extracted.

    Raises:
        ValueError: If ``response.body`` is not valid JSON (propagated from
            ``json.loads``).
    """
    self.logger.info('parse_news: %s', response)
    loader = ItemLoader(item=News(), response=response)
    json_response = json.loads(response.body)

    # Unique sentinel so "path not present" can be told apart from a value
    # that is present but None/empty.
    missing = object()

    def lookup(path):
        """Follow ``path`` (a tuple of keys) through ``json_response``.

        Returns ``missing`` when a key is absent or when an intermediate
        node is not a mapping (e.g. a list, string or None).
        """
        node = json_response
        try:
            for key in path:
                node = node[key]
        except (KeyError, TypeError):
            return missing
        return node

    # Shared prefixes of the document paths used below.
    news_item = ('NewsML', 'NewsItem')
    outer_component = news_item + ('NewsComponent', 'NewsComponent')
    article = outer_component + ('NewsComponent',)

    # The url is only required to exist; it is not checked for emptiness.
    url = lookup(article + ('NewsLines', 'MoreLink'))
    if url is missing:
        return loader.load_item()
    loader.add_value('url', url)

    title = lookup(article + ('NewsLines', 'HeadLine'))
    if title is missing or not title:
        return loader.load_item()
    loader.add_value('title', title)

    raw_content = lookup(
        article + ('ContentItem', 'DataContent', 'nitf', 'body',
                   'body.content', 'p')
    )
    if raw_content is missing or not raw_content:
        return loader.load_item()
    loader.add_value('raw_content', raw_content)

    # A missing 'Author' key ends parsing, whereas a present-but-empty
    # author is stored as an empty string and parsing continues.
    author_name = lookup(outer_component + ('Author',))
    if author_name is missing:
        return loader.load_item()
    loader.add_value('author_name', author_name or '')

    first_created = lookup(news_item + ('NewsManagement', 'FirstCreated'))
    if first_created is missing or not first_created:
        return loader.load_item()

    # Expected form is '<YYYYMMDD>T<HHMMSS>'; anything that cannot be split
    # into a date part and a time part is treated as unparseable.
    try:
        date_parts = first_created.split('T')
    except AttributeError:
        return loader.load_item()
    if len(date_parts) < 2:
        return loader.load_item()

    # Left-pad the time part with zeros up to six characters so values with
    # dropped leading zeros still match '%H%M%S'.
    date_parts[1] = date_parts[1].rjust(6, '0')
    try:
        published_at_wib = datetime.strptime(
            ' '.join(date_parts), '%Y%m%d %H%M%S'
        )
    except (ValueError, TypeError):
        return loader.load_item()

    # NOTE(review): wib_to_utc is defined elsewhere; presumably it converts
    # a Western Indonesia Time datetime to UTC -- confirm.
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)
    return loader.load_item()