
I have a Scrapy spider that pulls images and content from a few e-commerce sites. I have written some code to download the images, but I get the error below while Scrapy is downloading them:

.. 

      File "/usr/lib/python2.7/pprint.py", line 238, in format 
      return _safe_repr(object, context, maxlevels, level) 
      File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr 
      vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level) 
      File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr 
      rep = repr(object) 
      File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__ 
      return pformat(dict(self)) 
      File "/usr/lib/python2.7/pprint.py", line 63, in pformat 
      return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object) 
      File "/usr/lib/python2.7/pprint.py", line 122, in pformat 
      self._format(object, sio, 0, 0, {}, 0) 
      File "/usr/lib/python2.7/pprint.py", line 140, in _format 
      rep = self._repr(object, context, level - 1) 
      File "/usr/lib/python2.7/pprint.py", line 226, in _repr 
      self._depth, level) 
      File "/usr/lib/python2.7/pprint.py", line 238, in format 
      return _safe_repr(object, context, maxlevels, level) 
      File "/usr/lib/python2.7/pprint.py", line 282, in _safe_repr 
      vrepr, vreadable, vrecur = saferepr(v, context, maxlevels, level) 
      File "/usr/lib/python2.7/pprint.py", line 323, in _safe_repr 
      rep = repr(object) 
      File "/usr/local/lib/python2.7/dist-packages/Scrapy-0.23.0-py2.7.egg/scrapy/item.py", line 77, in __repr__ 
      return pformat(dict(self)) 
      File "/usr/lib/python2.7/pprint.py", line 63, in pformat 
      return PrettyPrinter(indent=indent, width=width, depth=depth).pformat(object) 
      File "/usr/lib/python2.7/pprint.py", line 122, in pformat 
      self._format(object, sio, 0, 0, {}, 0) 
      File "/usr/lib/python2.7/pprint.py", line 140, in _format 
      rep = self._repr(object, context, level - 1) 
      File "/usr/lib/python2.7/pprint.py", line 226, in _repr 
      self._depth, level) 
      File "/usr/lib/python2.7/pprint.py", line 238, in format 
      return _safe_repr(object, context, maxlevels, level) 
      File "/usr/lib/python2.7/pprint.py", line 280, in _safe_repr 
      for k, v in _sorted(object.items()): 
      File "/usr/lib/python2.7/pprint.py", line 78, in _sorted 
      with warnings.catch_warnings(): 
     exceptions.RuntimeError: maximum recursion depth exceeded 

Spider:

from scrapy.spider import Spider 
from scrapy.selector import Selector 
from scrapy.http import Request 

from loom.items import LoomItem 
import sys 


from scrapy.contrib.loader import XPathItemLoader 

from scrapy.utils.response import get_base_url 
from scrapy.contrib.spiders import CrawlSpider, Rule 
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor 

class LoomSpider(CrawlSpider):
    name = "loom_org"
    allowed_domains = ["2loom.com"]
    start_urls = [
        "http://2loom.com",
        "http://2loom.com/collections/basic",
        "http://2loom.com/collections/design",
        "http://2loom.com/collections/tum-koleksiyon",
    ]

    rules = [
        Rule(SgmlLinkExtractor(allow='products'), callback='parse_items', follow=True),
        Rule(SgmlLinkExtractor(allow=()), follow=True),
    ]

    def parse_items(self, response):
        sys.setrecursionlimit(10000)

        item = LoomItem()

        items = []
        sel = Selector(response)
        name = sel.xpath('//h1[@itemprop="name"]/text()').extract()
        brand = "2loom"
        price_lower = sel.xpath('//h1[@class="product-price"]/text()').extract()
        price = "0"
        image = sel.xpath('//meta[@property="og:image"]/@content').extract()
        description = sel.xpath('//meta[@property="og:description"]/@content').extract()

        print image

        ## the image is downloaded here
        loader = XPathItemLoader(item, response=response)
        loader.add_xpath('image_urls', '//meta[@property="og:image"]/@content')

        ## the ID is split off the name (e.g. "10. Design | Siyah & beyaz kalpli")
        id = name[0].strip().split(". ")
        id = id[0]

        item['id'] = id
        item['name'] = name
        item['url'] = response.url
        item['image'] = loader.load_item()
        item['category'] = "Basic"
        item['description'] = description
        item["brand"] = "2Loom"
        item['price'] = price
        item['price_lower'] = price_lower

        print item

        items.append(item)
        return items


Items:

# Define here the models for your scraped items 
# 
# See documentation in: 
# http://doc.scrapy.org/en/latest/topics/items.html 

from scrapy.item import Item, Field 

class LoomItem(Item): 
    # define the fields for your item here like: 
    # name = Field() 

    id = Field() 
    name = Field() 
    brand = Field() 
    image = Field() 
    category = Field() 
    description = Field() 
    price_lower = Field() 
    price = Field() 
    url = Field() 
    images = Field() 
    image_urls = Field()  

Pipeline:

from scrapy.contrib.pipeline.images import ImagesPipeline, ImageException 
from scrapy.http import Request 
from cStringIO import StringIO 
import psycopg2 
import hashlib 
from scrapy.conf import settings 

class MyImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        return [Request(x) for x in item.get('image_urls', [])]

    def item_completed(self, results, item, info):
        item['images'] = [x for ok, x in results if ok]
        return item

    # Override the convert_image method to disable image conversion
    def convert_image(self, image, size=None):
        buf = StringIO()
        try:
            image.save(buf, image.format)
        except Exception, ex:
            raise ImageException("Cannot process image. Error: %s" % ex)

        return image, buf

    def image_key(self, url):
        image_guid = hashlib.sha1(url).hexdigest()
        return 'full/%s.jpg' % (image_guid)

Settings:

BOT_NAME = 'loom' 

SPIDER_MODULES = ['loom.spiders'] 
NEWSPIDER_MODULE = 'loom.spiders' 


DOWNLOAD_DELAY  = 5 

ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1} 
IMAGES_STORE = '/root/loom/images/' 

IMAGES_THUMBS = { 
    'small': (90, 90), 
    'big': (300, 300), 
} 

USER_AGENT  = "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0" 
IM_MODULE  = 'loom.pipelines.MyImagePipeline' 
ITEM_PIPELINES = ['loom.pipelines.MyImagePipeline'] 



LOG_LEVEL = 'INFO' 

I don't understand why this error occurs. Thanks in advance for any help.

Answer


Try changing the recursion limit in your spider with sys.setrecursionlimit(10000). The Python interpreter had made roughly 900 recursive calls before raising the "RuntimeError" (the default limit is 1000).
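
A minimal sketch of that suggestion, assuming the limit is raised once at module import time in the spider file rather than inside every parse_items() call (the placement is an assumption, not part of the original answer):

import sys

# Raise the interpreter's recursion limit once, when the spider module is
# imported, instead of on every parse_items() call. 10000 is the value
# suggested above; it only needs to exceed the depth reached by the
# repr()/pprint chain shown in the traceback.
sys.setrecursionlimit(10000)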
