Search
 
SCRIPT & CODE EXAMPLE
 
CODE EXAMPLE FOR PYTHON

scrapy itemloader example

def parse_news(self, response):
        self.logger.info('parse_news: %s' % response)

        # Initialize item loader
        # extract news title, published_at, author, content, url
        # Required: title, raw_content, published_at
        loader = ItemLoader(item=News(), response=response)
        loader.add_value('url', response.url)

        title_selectors = response.css('h1.detailtitle::text')
        if not title_selectors:
            # If error, drop from the item pipeline
            return loader.load_item()
        title = title_selectors.extract_first().strip()
        loader.add_value('title', title)

        # Parse date information
        date_time = response.css('body > div > div.container > div.page-header > div::text').extract_first().strip()
        date_time = date_time.split(',')[-1].strip()
        date_time = ' '.join([_(w) for w in date_time.split(' ')]) # October => Oktober
        try:
            published_at_wib = datetime.strptime(date_time, '%d %B %Y %H:%M')
        except ValueError:
            # If error, drop from the item pipeline
            return loader.load_item()

        published_at = wib_to_utc(published_at_wib)
        loader.add_value('published_at', published_at)

        # If multipage
        multipage_selectors = response.css('.newsPagingWrap > a')
        if multipage_selectors:
            return self.parse_indices(multipage_selectors, loader)

        # Else if not multipage

        author_name_selectors = response.css('.newsContent > p > strong::text')
        if not author_name_selectors:
            loader.add_value('author_name', '')
        else:
            author_name = author_name_selectors.extract()[-1].strip()
            loader.add_value('author_name', author_name)

        # Extract the news content
        raw_content_selectors = response.css('.newsContent > p')
        if not raw_content_selectors:
            # Drop from the item pipeline
            return loader.load_item()
            
        raw_content = ' '.join(raw_content_selectors.extract())
        raw_content = raw_content.strip()
        loader.add_value('raw_content', raw_content)

        # Move scraped news to pipeline
        return loader.load_item() 
 
PREVIOUS NEXT
Tagged: #scrapy #itemloader
ADD COMMENT
Topic
Name
5+9 =