0
여기 내 웹 치료의 간단한 구조가 있습니다.치료에서 start_urls를 만드는 과정을 마무리하는 방법은 무엇입니까?
import scrapy,urllib.request
class TestSpider(scrapy.Spider):
def __init__(self, *args, **kw):
self.timeout = 10
name = "quotes"
allowed_domains = ["finance.yahoo.com"]
url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
s1 = s.split('\r\n')[1:-2]
namelist = []
for item in s1:
if "NASDAQ TEST STOCK" not in item:namelist.append(item)
s2 = [s.split('|')[0] for s in namelist]
s3=[]
for symbol in s2:
if "." not in symbol :
s3.append(symbol)
start_urls = ["https://finance.yahoo.com/quote/"+s+"/financials?p="+s for s in s2]
def parse(self, response):
content = response.body
target = response.url
#doing somthing ,omitted code
test.py로 저장하고 scrapy runspider test.py
으로 실행하려면 다음과 같이하십시오.
이제 start_urls를 만드는 모든 코드를 래핑하고 싶습니다.
여기 내 시도.
class TestSpider(scrapy.Spider):
def __init__(self, *args, **kw):
self.timeout = 10
url_nasdaq = "ftp://ftp.nasdaqtrader.com/SymbolDirectory/nasdaqlisted.txt"
s = urllib.request.urlopen(url_nasdaq).read().decode('ascii')
s1 = s.split('\r\n')[1:-2]
namelist = []
for item in s1:
if "NASDAQ TEST STOCK" not in item : namelist.append(item)
s2 = [s.split('|')[0] for s in namelist]
s3=[]
for symbol in s2:
if "." not in symbol : s3.append(symbol)
self.start_urls = ["https://finance.yahoo.com/quote/"+s+"/financials?p="+s for s in s3]
작동하지 않습니다.