Python – How to pass a Scrapy object item to the image pipeline

How to pass a Scrapy object item to the image pipeline… here is a solution to the problem.

How to pass a Scrapy object item to the image pipeline

I have a spider that downloads jpg for a specific website. In the past, I parsed response.url in the image pipeline to rename the file when it was downloaded. The problem is that the site’s directory structure is strange, so parsing the image_urls to rename the target file doesn’t work. As a workaround, I just use the original drawing name as the file.

I want to use data from the actual Scrapy item itself, but I can't seem to pass variables from the spider into the image pipeline. From the code below, I want to parse the url in the spider and pass it as a variable to otImagesPipeline in the pipeline, but nothing works. I tried looking at the Scrapy docs but couldn't find how to do this.

Is this possible in Scrapy?

Here is my crawler code :

settings.py:

# Scrapy project settings (settings.py).

BOT_NAME = 'bid'

# Allow the media pipeline to follow HTTP redirects when fetching images.
MEDIA_ALLOW_REDIRECTS = True

SPIDER_MODULES = ['bid.spiders']
NEWSPIDER_MODULE = 'bid.spiders'

# Route every scraped item through the custom image pipeline.
ITEM_PIPELINES = {'bid.pipelines.otImagesPipeline': 1}

# Root directory where downloaded images are stored.
IMAGES_STORE = 'C:\\temp\\images\\filenametest'

pipelines.py

import scrapy
# `scrapy.contrib` was deprecated and later removed; the images pipeline
# now lives in `scrapy.pipelines.images`.
from scrapy.pipelines.images import ImagesPipeline


class otImagesPipeline(ImagesPipeline):
    """Images pipeline that names each downloaded file after the last
    path segment of its URL instead of the default SHA1 hash.
    """

    def file_path(self, request, response=None, info=None):
        # e.g. 'http://host/a/b/pic.jpg' -> 'pic.jpg'
        targetfile = request.url.split('/')[-1]
        return targetfile

items.py

import scrapy


class BidItem(scrapy.Item):
    """Item carrying page metadata plus the image URLs to download."""

    url = scrapy.Field()         # page the image link was found on
    title = scrapy.Field()       # page title (h1.entry-title)
    caption = scrapy.Field()     # image caption text(s)
    image_urls = scrapy.Field()  # standard ImagesPipeline input field
    # Used by the updated spider/pipeline as an {image_url: name} mapping.
    # Without declaring it, assigning href['images'] raises KeyError.
    images = scrapy.Field()

getbid.py (spiders).

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bid.items import BidItem
from urllib import parse as urlparse


class GetbidSpider(CrawlSpider):
    """Crawl example.com and yield one BidItem per .jpg link found."""

    name = 'getbid'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    # NOTE: `rules` and `parse_item` must be indented inside the class
    # body — in the original paste they sat at module level, so the
    # CrawlSpider never saw them.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Collect metadata for every .jpg link on the page."""
        for sel in response.xpath('//a'):
            link = str(sel.xpath('@href').extract()[0])
            if link.endswith('.jpg'):
                href = BidItem()
                href['url'] = response.url
                href['title'] = response.css("h1.entry-title::text").extract_first()
                href['caption'] = response.css("p.wp-caption-text::text").extract()
                href['image_urls'] = [link]
                yield href
                # Also follow the link so deeper pages get crawled.
                yield scrapy.Request(
                    urlparse.urljoin('http://www.example.com/', link),
                    callback=self.parse_item)

Update

Thanks to Umair’s help, I was able to fix it as I needed.
Here is the modified code:

getbid.py

    def parse_item(self, response):
        """Yield a BidItem per .jpg link, recording the directory name
        the image should be saved under in item['images'].
        """
        for sel in response.xpath('//a'):
            link = str(sel.xpath('@href').extract()[0])
            if link.endswith('.jpg'):
                href = BidItem()
                href['url'] = response.url
                href['title'] = response.css("h1.entry-title::text").extract_first()
                href['caption'] = response.css("p.wp-caption-text::text").extract()
                # Second-to-last path segment of the page URL becomes the
                # target sub-directory for the image.
                future_dir = href['url'].split("/")[-2]
                href['images'] = {link: future_dir}
                yield href
                # Fixed: the original line was missing the opening quote
                # around the base URL, which is a SyntaxError.
                yield scrapy.Request(
                    urlparse.urljoin('http://www.example.com/', link),
                    callback=self.parse_item)

pipelines.py

class otImagesPipeline(ImagesPipeline):
    """Download each image listed in item['images'] and save it under
    the sub-directory name recorded by the spider.
    """

    # NOTE: both methods must be indented inside the class body — in the
    # original paste they sat at module level, so the pipeline fell back
    # to the default ImagesPipeline behaviour.
    def get_media_requests(self, item, info):
        if 'images' in item:
            for image_url, img_dir in item['images'].items():
                request = scrapy.Request(url=image_url)
                # Carry the target directory through to file_path().
                request.meta['img_dir'] = img_dir
                yield request

    def file_path(self, request, response=None, info=None):
        filename = request.url.split('/')[-1]
        filedir = request.meta['img_dir']
        # Saved as '<dir>/<original filename>' under IMAGES_STORE.
        filepath = filedir + "/" + filename
        return filepath

Solution

You can set IMAGES_STORE on your Spider class (via custom_settings), and then access it later in the file_path method of the ImagesPipeline through info.spider.

class GetbidSpider(CrawlSpider):
    """Spider variant that carries the image store path on the class
    (IMAGE_DIR) and overrides IMAGES_STORE via custom_settings, so the
    pipeline can read it back through info.spider.
    """

    name = 'getbid'

    IMAGE_DIR = 'C:\\temp\\images\\filenametest'

    # Per-spider override of the project-wide IMAGES_STORE setting.
    custom_settings = {
        "IMAGES_STORE": IMAGE_DIR
    }

    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    # NOTE: all class members below must be indented inside the class
    # body — the original paste had them at module level.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        for sel in response.xpath('//a'):
            link = str(sel.xpath('@href').extract()[0])
            if link.endswith('.jpg'):
                href = BidItem()
                href['url'] = response.url
                href['title'] = response.css("h1.entry-title::text").extract_first()
                href['caption'] = response.css("p.wp-caption-text::text").extract()
                # Map the image URL to the file name it should get.
                href['images'] = {link: href['title']}
                yield href
                yield scrapy.Request(
                    urlparse.urljoin('http://www.example.com/', link),
                    callback=self.parse_item)

Then in your ImagesPipeline

class CustomImagePipeline(ImagesPipeline):
    """Pipeline that saves each image under the name chosen by the
    spider: item['images'] maps image_url -> file name.
    """

    def get_media_requests(self, item, info):
        if 'images' in item:
            # .iteritems() is Python 2 only; .items() is correct for
            # Python 3, which this project targets (it already uses
            # `from urllib import parse`).
            for image_url, img_name in item['images'].items():
                request = scrapy.Request(url=image_url)
                # Carry the chosen file name through to file_path().
                request.meta['img_name'] = img_name
                yield request

    def file_path(self, request, response=None, info=None):
        # `os` was used but never imported in the original snippet;
        # imported locally so this snippet stands on its own.
        import os
        # info.spider gives access to the running spider instance, so
        # the class attribute IMAGE_DIR set on the spider is reachable.
        return os.path.join(info.spider.IMAGE_DIR, request.meta['img_name'])

Related Problems and Solutions