BeautifulSoup/Requests로 URL을 긁어 내고 필요한 섹션 만 꺼내서 제거하십시오. 다른 대상 URL을 결정한 후에 HTML을 올바르게 출력하지만 코드를 정리하는 코드가 작동하지 않습니다. 여기 내 코드는 다음과 같습니다.BeautifulSoup을 이용한 Python 스크래핑/파싱
import requests
from bs4 import BeautifulSoup
import bs4.element
import pprint
def connection(url):
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_5)'}
r = requests.get(url,headers=headers)
soup = BeautifulSoup(r.text)
return soup
def scrape_metacritic(soup,movie_list=[]):
for mlist in get_modules(soup).items():
for movie in mlist:
try:
m = parse_movie_li(movie)
except:
continue
#m['release_type']=release_type
movie_list.append(m)
return movie_list
def just_tags(templist):
tags = [t for t in templist if isinstance(t,bs4.element.Tag)]
return tags
def get_modules(soup):
module = soup.find(class_='body_wrap') #body_wrap
module_dict = {}
for mod in module.find_all('li', class_='product'):
movie_lis = just_tags(mod.find('ul').contents)
module_dict[mod]=movie_lis
return module_dict
get_modules(soup)
그 부분이 작동합니다.
{<li class="product limited_release_product has_small_image"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-birders-guide-to-everything">A Birder's Guide to Everything</a></h3></div><a class="basic_stat product_score" href="/movie/a-birders-guide-to-everything">
<span class="metascore_w medium movie positive">61</span>
</a></div> <div class="more_stats extended_stats">
<ul class="more_stats">
<li class="stat release_date">
<span class="label">Release Date:</span>
<span class="data">March 21, 2014</span>
</li>
<li class="stat rating">
<span class="label">Rated:</span>
<span class="data">
..
<span class="data">136 min</span>
</li>]}
은 지금이 그것을 청소하려고 :
from dateutil import parser
def parse_movie_li(li):
title_div = li.find(class_='product_title')
movie = {
'title':title_div.text.strip(),
'rel_url':title_div.find('a')['href'],
'release_date':get_release_date(li.find(class_='release_date').find(class_='data')),
'metascore_w':get_metascore_w(li.find(class_='metascore_w')),
'user_score':get_user_score(li.find(class_='product avg_userscore').find(class_='data')), #add func
'genre':get_genre(li.find(class_='genre').find(class_='data')), #add func
'star_cast':get_star_cast(li.find(class_='cast').find(class_='data')), #add func
'runtime':get_runtime(li.find(class_='runtime').find(class_='data')) #add func
}
#print movie,'\n'
return movie
def get_metascore_w(div):
try:
score = div.text
except:
print 'no text in metascore div'
return None
try:
score = int(score)
except:
pass
return score
def get_release_date(div):
try:
datestr = div.text
except:
return None
try:
date = parser.parse(datestr)
except:
return datestr
return date
def get_user_score(div):
try:
uscore = div.text
except:
print 'no text in userscore div'
return None
try:
uscore = int(uscore)
except:
pass
return uscore
def get_genre(div):
try:
genre = div.text
except:
print 'no text in genre div'
return None
try:
genre
except:
pass
return score
def get_star_cast(div):
try:
cast = div.text
except:
print 'no text in cast div'
return None
try:
cast
except:
pass
return cast
def get_runtime(div):
try:
runtime = div.text.strip(' min')
except:
print 'no text in runtime div'
return None
try:
runtime = int(runtime)
except:
pass
return runtime
그것은해야
url = 'http://www.metacritic.com/browse/movies/title/dvd/a?view=detailed'
soup = connection(url)
이 내가 스크랩 한 후지고있어 무엇의 일부는 다음과 같습니다 URL은 이 형식으로 출력하십시오 :
[{'metascore_w': 28,
'rel_url': '/movie/mortdecai',
'release_date': datetime.datetime(2015, 1, 23, 0, 0),
'release_type': u'Wide releases now in theaters',
'title': u'Mortdecai'},
{'metascore_w': 24,
'rel_url': '/movie/strange-magic',
'release_date': datetime.datetime(2015, 1, 23, 0, 0),
'release_type': u'Wide releases now in theaters',
'title': u'Strange Magic'},
..
{'metascore_w': u'tbd',
'rel_url': '/movie/20-once-again',
'release_date': datetime.datetime(2015, 1, 16, 0, 0),
'release_type': u'Limited releases now in theaters',
'title': u'20 Once Again'}]
Howeve r, 나는 이것을 얻고있다 :
{<li class="product limited_release_product has_small_image alt"><div class="wrap product_wrap"><div class="product_basics stats"><div class="basic_stats has_score"><div class="main_stats"><div class="basic_stat product_title"><h3 class="product_title"><a href="/movie/a-family-thing">A Family Thing</a></h3></div><a class="basic_stat product_score" href="/movie/a-family-thing">
<span class="metascore_w medium movie positive">71</span>
</a></div> <div class="more_stats extended_stats">
<ul class="more_stats">
<li class="stat release_date">
<span class="label">Release Date:</span>
<span class="data">March 29, 1996</span>
</li>..
이것은 분석되지 않는다. parse_movie_li 기능을 사용하여 내가 잘못하고있는 것에 대한 지침이 있습니까?
'just_tags() '란 무엇입니까? – alecxe
2 개의 기능을 추가하는 것을 잊었습니다 .. thx @alexce – DNburtonguster