지난 몇 시간 동안 최선을 다 했음에도 불구하고 크롤러 (searchengine.py
)를 실행할 수 없습니다. 그것은 페이지가 성공적으로 색인을 생성 할 수없는 것 같습니다. 전체 크롤러 코드를 알려 드리겠습니다. 오류의 종류는 내가Python 웹 크롤러를 디버깅하는 데 도움이 필요합니다.
Indexing http://www.4futureengineers.com/company.html
Could not parse page http://www.4futureengineers.com/company.html
아래 내 파이썬 대화 형 세션 (쉘)에서 다음 명령을 입력하여 searchengine.py
를 호출하고 같은 외모를 수신하고 있습니다. 그것은 오류를주고
>> import searchengine
>> crawler=searchengine.crawler('searchindex.db')
>> pages= \
.. ['http://www.4futureengineers.com/company.html']
>> crawler.crawl(pages)
바로 명령 여기 crawler.crawl(pages)
searchengine.py의 완전한 소스 코드 후 실패 구문 분석, 즉
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
from pysqlite2 import dbapi2 as sqlite
# Create a list of words to ignore
ignorewords={'the':1,'of':1,'to':1,'and':1,'a':1,'in':1,'is':1,'it':1}
class crawler:
# Initialize the crawler with the name of database
def __init__(self,dbname):
self.con=sqlite.connect(dbname)
def __del__(self):
self.con.close()
def dbcommit(self):
self.con.commit()
# Auxilliary function for getting an entry id and adding
# it if it's not present
def getentryid(self,table,field,value,createnew=True):
cur=self.con.execute(
"select rowid from %s where %s='%s'" % (table,field,value))
res=cur.fetchone()
if res==None:
cur=self.con.execute(
"insert into %s (%s) values ('%s')" % (table,field,value))
return cur.lastrowid
else:
return res[0]
# Index an individual page
def addtoindex(self,url,soup):
if self.isindexed(url): return
print 'Indexing '+url
# Get the individual words
text=self.gettextonly(soup)
words=self.separatewords(text)
# Get the URL id
urlid=self.getentryid('urllist','url',url)
# Link each word to this url
for i in range(len(words)):
word=words[i]
if word in ignorewords: continue
wordid=self.getentryid('wordlist','word',word)
self.con.execute("insert into wordlocation(urlid,wordid,location) values (%d,%d,%d)" % (urlid,wordid,i))
# Extract the text from an HTML page (no tags)
def gettextonly(self,soup):
v=soup.string
if v==Null:
c=soup.contents
resulttext=''
for t in c:
subtext=self.gettextonly(t)
resulttext+=subtext+'\n'
return resulttext
else:
return v.strip()
# Seperate the words by any non-whitespace character
def separatewords(self,text):
splitter=re.compile('\\W*')
return [s.lower() for s in splitter.split(text) if s!='']
def isindexed(self,url):
u=self.con.execute \
("select rowid from urllist where url='%s'" % url).fetchone()
if u!=None:
#Check if it has actually been crawled
v=self.con.execute(
'select * from wordlocation where urlid=%d' % u[0]).fetchone()
if v!=None: return True
return False
def crawl(self,pages,depth=2):
for i in range(depth):
newpages={}
for page in pages:
try:
c=urllib2.urlopen(page)
except:
print "Could not open %s" % page
continue
try:
soup=BeautifulSoup(c.read())
self.addtoindex(page,soup)
links=soup('a')
for link in links:
if ('href' in dict(link.attrs)):
url=urljoin(page,link['href'])
if url.find("'")!=-1: continue
url=url.split('#')[0] # remove location portion
if url[0:4]=='http' and not self.isindexed(url):
newpages[url]=1
linkText=self.gettextonly(link)
self.addlinkref(page,url,linkText)
self.dbcommit()
except:
print "Could not parse page %s" % page
pages=newpages
# Create the database tables
def createindextables(self):
self.con.execute('create table urllist(url)')
self.con.execute('create table wordlist(word)')
self.con.execute('create table wordlocation(urlid,wordid,location)')
self.con.execute('create table link(fromid integer,toid integer)')
self.con.execute('create table linkwords(wordid,linkid)')
self.con.execute('create index wordidx on wordlist(word)')
self.con.execute('create index urlidx on urllist(url)')
self.con.execute('create index wordurlidx on wordlocation(wordid)')
self.con.execute('create index urltoidx on link(toid)')
self.con.execute('create index urlfromidx on link(fromid)')
self.dbcommit()
노트마다 시도했지만 지금까지는 운이 없습니다. – anayb
"지금까지 행운이 없다"는 것은별로 도움이되지 않습니다. 너 정확히 뭐 했어? 마지막 단락을 따르는 경우 더 구체적인 오류 추적을 받아야합니다. 그들은 무엇인가? – jonrsharpe