2016-08-29 5 views
0

FormRequest를 사용하여 웹 사이트에 로그인하기 위해 scrapy를 사용하고 있지만 사이트가 ASP.Net에 내장되어 있으므로 페이지 매김을 위해 FormRequest도 사용해야하지만 코드는 제대로 실행되지만 ' 내가 새로운 치료법을 사용하기 때문에 페이지 매김을 위해 구성하는 방법을 안다. parse_item()에서 return을 사용하면 작동하지만 paraginit()에서 yield를 사용하면이 오류가 발생합니다.여러 formrequest에 대한 Scrapy 오류

2016-08-29 20:44:59 [scrapy] ERROR: Spider must return Request, BaseItem, dict or None, got 'list' in <GET https://recruiter.cwjobs.co.uk/Recruitment/CandidateSearch/CandidateSearchResults.aspx?SalaryRateTypeId=1&SalaryRangeIds=17%2c18%2c19%2c20%2c21%2c22%2c23%2c24%2c25%2c26&LastActivityId=15&Radius=-1&JobTypeProfile=20&LimitSearch=True&CandidateSearchCriteria=vKYIkjLZq5Af6OEmkANngg%3d%3d&scr=1&iasc=0> 

이것은

import scrapy 
from scrapy.http import Request 
from scrapy.http import FormRequest 
from cwjobs.items import CwjobsItem 


class RecruiterSpider(scrapy.Spider): 
    name = "recruiter" 
    allowed_domains = ["recruiter.cwjobs.co.uk"] 
    start_urls = (
     'https://recruiter.cwjobs.co.uk/loginReturnUrl=%2fhome%3fRHP%3dnav_bar_SignIn/',) 

    def start_requests(self): 
     return [FormRequest("https://recruiter.cwjobs.co.uk/login/",formdata={"__EVENTTARGET":"","__EVENTARGUMENT":"","__VIEWSTATE":"QI2hCUmnX2GZ+vtA2RoynX1rSOZ0LG+0ixQlSPqGcTM9qCheVZwbfaMtPeQAfiQCmM/aJhVjQ7bljYbGfVUEhzVsDaNRB+3qBuOc+SYZ+pHoSk2s0cFz6f5ODgqv/6Jj12bUs7OKnyIa8mlPo+xfmhS+oWroHnJyfPvBAGZkInpW5EcmmKqHD2Ede0XdsH2mMM4nPIy+PRsGW1ZeVd6HifZC1RG9bFXlunoIlQDNhDQeOpRmVdcRroybtCCp+1jLrH4EOGKfOCQ+o2WFGBfldPfS1AHGXL9tDHwvrol4Cx/nK01y1E27PWobQ2RlUXINMBNditfn3qTKCKlGRSLHMJ+PpfZJv1ncmNTvtV+kR1O5vTLbw03Ct3HMzw4GI/zmwojQqUXa0Z4vAoe6bqkzZpm1qKtzzsdpsp5uLTaGiv3SAlDXrK/vuvCFGMqZTMAoqJ47WluyIFsA3Y4dak69mF/UMH3+Foizgh+37IHrL6hM2v17NyvfMAgJXncASJ6P85t8R3Xr2Q4Z1kEbKna1Qi4yINI+wrSmZSSdcTnw3oiklUBCATmFbbnPdhNbr9AIK3lm7hu8OxrXRDRjsOulpB5BgS0Xu4O/8G0A4UNWlLGFoaNdOa/P8UZFvTiRL0uZJR1bL9QImr7DT5ChOPPh4Xzf4KdmB/L7/gRiQlhxQ6ek4BxcjruN3sZ6eFNrEAAbFGMuxevrFlBM+FFvwHEOEK03pYtBjrDhGTVeujLJO7TCetqUZ7+PVGs17by20kkOEMvOFKx9mTeW4oFzbqAUQvQjhW+hSEVmNvRzw+lhov0v1OUcrTdGL6C6sk9jKUALgiyOWEabMSGqoWA5eVQEyiFXVuAQ5AJZcKeQ13wDGZ1HFXj/dlE+jA6p0E/FfEc+A5T69bTN6zjvCwkew/DxJxmxBBBxxnMhgbn8qnpbVRkJj9cg/uTJoD7zI7WWnUTK9neMdPCLGa1MPvXNV/YkCGgswrGqKk9B4eWdGQHqhJJj+Fgb7uW1ZycnuyBoHup8rpKEx1wz56voovTuVRBFk60CHv8MDMcmqAbXujUGwKgZCraUVtAgV10eTG8emVCGGAE5LOkl8eo1h7iV/VWZieE3H+VgD7hucFv2Ny7pzqrxZ68xZEu2F7MQgKL92uKGsNyrHcjTwtCcorYoTIXTGOAlZo3FA5LXL2XFAmVCHH3smh0r2yQyQitQ7oaqVX2jgTL4HdXTVny9Qf5pdkDlHneSCmkMVN45ILhmpTWKj27kpSK/QlYvoG+cvKKdXW2wWJ5ZZ2sqHqH4lWNVmgARYG8JDIXLNRRHv+S5MBGg0hQ6llYrparx6azMop5cx3AeMssimtPJvl+FvcNyqpAZpMsiXEpTBWlHUHdyO3PCq8yYpE4SoOn7NmiVqDE69c2z1/pHlH0fQDUsa7UsKHOAHtyznX0E29q8r0zNJEpNhUH/uX/6G7syXljeOB0P1XVTRbZmL8mFBCMxMPCt/vFi+MPKgr2aPlT7RPv+yy4bILRavikMOKFJ+Cf2Q3r0J60feH+bKISzib9VPvfdj2qudb0Ctt7XbTi0vWKmikStwMwZiVlZlpHImSgmokCC7T988NFHhGw+84Kxc7r8CyBTdfqC2flZpCM5VqY1q1kw/YklVnsm0Uv2FBT0gy4kAQxgOw9D4aA3Ahqr7dWiDDiGPc5/U/ci3D6v9pbbCg3rOGAI4zEUFli0n3OPjEIwCzRi3KVkgSenZjGcTNEtA/sqL8WuMzxv9dIporx76Iwxy6D8wPbWogn30WcHfqR2VWoPvH4Q1fz/4a1hnY9P6N8Y3AEVKrc9fnRaQu/LNQAQajqU5PqLAVmZgbJo4w8M839nQk+nxO+vkidRxU0hONe7dgADn9mqYf4ss0ITvzEvoLdFv9DdjcBVXh/ZxFZZeVZAZ0B+bXQ3Sf7oMEmZSL0rBxq47EG1MDLksHnQZF0VbOPsdsJpKK770zbcAe4yLgVRye6RGxObQfOWaJVGhZXjMnk8+HEspMLLLj3jUKPkHMUbK7mvjWs3A2o0Z4g=","__VIEWSTATEGENERATOR":"607E8F97","LoginPanel$txtUsername":"*******","LoginPanel$txtPassword":"*********","LoginPanel$btnSubmit":"Sign in","Register$txtFirstName":"","Register$txtLastName":"","Register$txtCompanyName":"","Register$txtBusinessPhone":"","Register$txtEmailAddress":"","Register$txtPassword":"","Register$txtPasswordConfirm":"","Register$txtCharityNumber":"","txtReminderUsername":""})] 

    def parse(self,response): 
     print response.xpath("//h1[@class='account-name']/text()").extract() 
     return Request("https://recruiter.cwjobs.co.uk/Recruitment/CandidateSearch/CandidateSearchResults.aspx?SalaryRateTypeId=1&SalaryRangeIds=17%2c18%2c19%2c20%2c21%2c22%2c23%2c24%2c25%2c26&LastActivityId=15&Radius=-1&JobTypeProfile=20&LimitSearch=True&CandidateSearchCriteria=vKYIkjLZq5Af6OEmkANngg%3d%3d&scr=1&iasc=0", callback = self.parse_item) 

    def parse_item(self, response): 
     candsearch = response.xpath("//input[@id='CandidateSearchResults']/@value").extract()[0] 
     viewsgenerator = response.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value").extract()[0] 
     print viewsgenerator 
     newsearch = response.xpath("//input[@id='NewSearchCriteria']/@value").extract()[0] 
     searchcriteria = response.xpath("//input[@id='CandidateSearchCriteria']/@value").extract()[0] 
     viewstate = response.xpath("//input[@id='__VIEWSTATE']/@value").extract()[0] 

     for i in range(1, 3): 
      print i 
      data = {"__EVENTTARGET":"ctl00$cphCentralPanel$ucSearchResults$pgrPager","__EVENTARGUMENT":str(i),"CandidateSearchCriteria":searchcriteria,"NewSearchCriteria":newsearch,"Keywords":"","CandidateSearchResults":candsearch,"__LASTFOCUS":"","__VIEWSTATE":viewstate,"__VIEWSTATEGENERATOR":viewsgenerator,"ctl00$cphCentralPanel$NewOrExistingSavedSearch":"rdoNewSavedSearch", "ctl00$cphCentralPanel$txtSavedSearchName":"","ctl00$cphCentralPanel$ucSearchResults$hdnPopoverLinkClicked":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$txtBoolean":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$hdnIsAutosuggestChosen":"0","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$searchTypePart$qsSearchType":"rbProfileAndCV", "ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$txtPostcode":"","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$ddlRadius":"-1", "ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$qsLoc":"rdoPostcode","ctl00$cphCentralPanel$ucSearchResults$ucFacetedSearch$ddlLastActivity":"15","ctl00$cphCentralPanel$ucSearchResults$ddlSort":"Relevancy#0", "ctl00$cphCentralPanel$ucSearchResults$ddlPageSize":"50"} 
      request = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)] 
      yield request 

    def parse_page2(self,response): 
     li = response.xpath("//div[@class = 'row card-row']") 
     for l in li: 
      item = CwjobsItem() 
      firstname = l.xpath(".//a[@class='candidate-lnk']//span[@class='firstName']/text()").extract() 
      lastname = l.xpath(".//a[@class='candidate-lnk']//span[@class='lastName']/text()").extract() 
      item['name'] = firstname + lastname 
      det = l.xpath(".//div[@id='current-expected-row']") 
      for d in det: 
       currs = d.xpath(".//li[contains(@id, 'CurrentSalary')]/span/text()").extract() 
       if currs: 
        item['currs'] = currs[0].strip() 
       currjobt = d.xpath(".//li[contains(@id, 'CurrentJobTitle')]/span/text()").extract() 
       if currjobt: 
        item['currjobt'] = currjobt[0].strip() 
       Experience = d.xpath(".//li[contains(@id, 'Experience')]/span/text()").extract() 
       if Experience: 
        item['Experience'] = Experience[0].strip() 
       Desiredjob = d.xpath(".//li[contains(@id, 'DesiredJobTitle')]/span/text()").extract() 
       if Desiredjob: 
        item['Desiredjob'] = Desiredjob[0].strip() 
       Desireds = d.xpath(".//li[contains(@id, 'DesiredSalary')]/span/text()").extract() 
       if Desireds: 
        item['Desireds'] = Desireds[0].strip() 
       DesiredLoc = d.xpath(".//li[contains(@id, 'DesiredLocations')]/span/text()").extract() 
       if DesiredLoc: 
        item['DesiredLoc'] = DesiredLoc[0].strip() 
       phone = l.xpath("//span[@class='action-span hiddendata']/@data-hiddendataurl").extract() 
       if phone: 
        item['phonel'] = "https://recruiter.cwjobs.co.uk"+ phone[0] 
       cvl = l.xpath("//a[@class='action-link view-cv-icon cv-action-button']/@href").extract() 
       if cvl: 
        item['cvl'] = "https://recruiter.cwjobs.co.uk"+ cvl[0] 
       emaillink = l.xpath("//a[@class='hiddendata action-link email-candidate']/@data-hiddendataurl").extract() 

       if emaillink: 
        emaillink = "https://recruiter.cwjobs.co.uk" + emaillink[0] 
        item['email'] = emaillink 
        # request.meta['item'] = item 
        # yield request 
        # return 

       # yield Request(item['cvl']) 
       # item['email'] = [response.body] 
       return item 
    # def parse_page(self,response): 
    # # item = response.meta['item'] 
    # item['email'] = response.body 
    # yield item 

가 어떻게이 문제를 해결할 수

내 코드?

+0

대부분의 경우 FormRequest에 일부 데이터가 누락되었다는 의미입니다. – Granitosaurus

+0

나는 각 자료를 인쇄했다, 각 가치는 존재했다. 그래도 여전히 오류가 표시됩니다. –

+0

안녕하세요, Granitos, 질문을 수정했습니다. 다시 검토해 주시겠습니까 –

답변

0

치료 중 하나가 scrapy.Item 또는 scrapy.Request 일 때 목록을 반환합니다.

범인 라인 :

request = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)] 
return request 

는 목록에 요청을하거나 반복해서는 안 중 하나를이 문제를 해결하고 각 요소를 얻었다.

request = FormRequest.from_response(response, formdata = data, callback = self.parse_page2) 
return request 
# or 
requests = [FormRequest.from_response(response, formdata = data, callback = self.parse_page2)] 
for r in requests: 
    yield r 
+0

페이지 매김이 아직 작동하지 않습니다. 결과는 50 후 반복됩니다. –

+0

@ GokuFrieza이 대답은 오류와 관련된 질문에 대한 답변입니다. 페이지 매김 문제의 경우 매우 구체적인 사례이므로 formdata에 뭔가 빠져있을 가능성이 있으므로 모든 formdata 값을 하드 코딩해야합니다. 실제로는 정적 값이 거의 없으므로 페이지 소스에서 해당 값을 검색해야합니다. 모든 페이지마다 다름). 네트워크 관리자를 사용하여 전체 프로세스를 리버스 엔지니어링하고 asp.net에서 예상하는 것과 동일한 데이터를 보내야합니다. – Granitosaurus

+0

문제는 "FormRequest.from_response (응답", 원시 URL을 사용하는 경우 페이지 매김을 얻을 수 있지만 치료 셸에서만 사용), from_response를 사용하면 페이지 매김을 얻지 못합니다. –