2014-04-09 11 views
0

img 클래스에서 제목과 alt 값을 추출해야 하는 HTML 파일입니다. BeautifulSoup 또는 XPath를 사용하여 img 클래스에서 제목을 추출하는 방법

<div id="BVCustomerRatings" class="BVBrowserFF"> 
    <div class="BVRRRootElement"> 
     <div class="BVRRRatingSummary BVRRPrimaryRatingSummary"> 
      <div class="BVRRRatingSummaryStyle2"> 
       <div class="BVRRRatingSummaryHeader"></div> 
       <div class="BVRROverallRatingContainer"> 
        <div class="BVRRRatingContainerStar"> 
         <div class="BVRRRatingEntry BVRROdd"> 
          <div id="BVRRRatingOverall_" class="BVRRRating BVRRRatingNormal BVRRRatingOverall"> 
           <div class="BVRRLabel BVRRRatingNormalLabel"></div> 
           <div class="BVRRRatingNormalImage"> 
            <img class="BVImgOrSprite" width="75" height="15" **title="3.4 out of 5" alt="3.4 out of 5"** src="http://walmart.ugc.bazaarvoice.com/1336/3_4/5/rating.png"></img> 
           </div> 
           <div class="BVRRRatingNormalOutOf"></div> 
          </div> 
         </div> 
        </div> 
       </div> 

이것이 제 코드입니다!

from bs4 import BeautifulSoup, Tag 
import urllib2 
import re 


def complete_url(items_url):
    """Fetch one Walmart item page and print its rating image's title and alt.

    ``items_url`` is the site-relative path of the item page; it is appended
    to ``http://www.walmart.com`` to build the URL that gets downloaded.

    The rating image is located through its ``src`` attribute, which points
    at a ``bazaarvoice`` host. When the page contains no such image, ``None``
    is printed instead.
    """
    items = "http://www.walmart.com" + items_url
    response = urllib2.urlopen(items)
    try:
        main_source = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    soup = BeautifulSoup(main_source)

    # Other fields of the item page, currently disabled:
    #Title=soup.find('h1',{"class":"productTitle"}).text.strip()
    #Price=soup.find('span',{"class":"bigPriceText1"}).text.strip()+soup.find('span',{"class":"smallPriceText1"}).text.strip()
    #Availability=soup.find('span',{"id":"STORE_AVAIL"}).text.strip()
    #Description=soup.find('span',{"class":"ql-details-short-desc"}).text.strip()

    # A bare second positional argument to find() is matched against the CSS
    # class, so the pattern has to be passed as the ``src`` keyword to filter
    # on the image URL instead.
    images = soup.find('img', src=re.compile("bazaarvoice"))
    if images is None:
        # NOTE(review): the rating markup may be injected client-side, in
        # which case it is absent from the downloaded HTML - confirm.
        print(images)
        return

    # Tag.get() returns the attribute value, or None when it is missing.
    print(images.get('title'))
    print(images.get('alt'))


#print 'Title:%s,Price:%s,Availability:%s,Description:%s,Avg_Rating:%s' %(Title,Price,Availability,Description,Avg_Rating) 


def url_soup(url):
    """Walk a Walmart search-result listing page by page.

    On every page, the ``href`` of each ``a.ListItemLink`` anchor is handed
    to ``complete_url``. The "Next" anchor is then followed until a page
    carries the ``SPPagNoLink jump next`` anchor or has no "Next" anchor at
    all; at that point ``<<<<Last Page Reached>>>>`` is printed and the
    function returns.
    """
    # Iterate instead of recursing so long result lists cannot exhaust the
    # interpreter's recursion limit.
    while True:
        response = urllib2.urlopen(url)
        try:
            source = response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
        soup = BeautifulSoup(source)

        # Visit every item on the page, not only the last one found.
        for anchor in soup.select('a.ListItemLink'):
            item_link = anchor.get('href')
            if item_link is not None:
                complete_url(item_link)

        link1 = soup.find('a', href=True, text=re.compile("Next"))
        # Presumably the disabled "next" control shown on the final page;
        # its presence is what ends the crawl.
        link2 = soup.find('a', class_="SPPagNoLink jump next")
        if link2 is not None or link1 is None:
            print("<<<<Last Page Reached>>>>")
            return

        # The href may contain stray whitespace; strip it before joining.
        url = ('http://www.walmart.com/search/search-ng.do'
               + re.sub(r'\s', '', link1['href']))


# Maps a department display name to the value sent as the
# ``search_constraint`` query parameter by gen_url().
Dept = {
    "All Departments": "0",
    "Apparel": "5438",
    "Auto": "91083",
    "Baby": "5427",
    "Beauty": "1085666",
    "Books": "3920",
    "Electronics": "3944",
    "Gifts": "1094765",
    "Grocery": "976759",
    "Health": "976760",
    "Home": "4044",
    "Home Improvement": "1072864",
    # Correctly spelled alias; the misspelled key below is kept so existing
    # callers keep working.
    "Jewelry": "3891",
    "Jwelery": "3891",
    "Movies": "4096",
    "Music": "4104",
    "Party": "2637",
    "Patio": "5428",
    "Pets": "5440",
    "Pharmacy": "5431",
    "Photo Center": "5426",
    "Sports": "4125",
    "Toys": "4171",
    "Video Games": "2636",
}


def gen_url(keyword, domain):
    """Build the search URL for ``keyword`` in ``domain`` and start the crawl.

    ``domain`` must be one of the keys of ``Dept``; its value becomes the
    ``search_constraint`` query parameter. The URL is printed and then passed
    to ``url_soup``.

    Raises:
        ValueError: if ``domain`` is not a key of ``Dept``.
    """
    constraint = Dept.get(domain)
    if constraint is None:
        # Fail with a clear message instead of hitting an unbound
        # ``main_url`` further down.
        raise ValueError("Unknown department: %r" % (domain,))

    # NOTE: ``keyword`` is inserted verbatim; callers must pass a value that
    # is already safe to embed in a query string.
    main_url = ('http://www.walmart.com/search/search-ng.do?search_query=%s'
                '&ic=16_0&Find=Find&search_constraint=%s') % (keyword, constraint)
    print(main_url)
    url_soup(main_url)


# Script entry: search for 'Laptop' under the 'All Departments' entry.
gen_url(keyword='Laptop', domain='All Departments')

답변

1

매우 간단합니다. 제가 게시한 것은 ...... 실제로 해당 페이지에는 이미지가 많이 있습니다. 앞의 방법을 시도해 보았지만 작동하지 않았습니다.

In [1]: from bs4 import BeautifulSoup 
In [2]: html = ''' 
    ...: <div id="BVRRRatingOverall_" class="BVRRRating BVRRRatingNormal BVRRRatingOverall"> 
    ...:  <div class="BVRRLabel BVRRRatingNormalLabel"></div> 
    ...:   <div class="BVRRRatingNormalImage"> 
    ...:    <img class="BVImgOrSprite" width="75" height="15" title="3.4 out of 5" alt="3.4 out of 5" src="http://walmart.ugc.bazaarvoice.com/1336/3_4/5/rating.png"></img> 
    ...:   </div> 
    ...:  <div class="BVRRRatingNormalOutOf"></div> 
    ...: </div> 
    ...: ''' 
In [3]: soup = BeautifulSoup(html) 
In [4]: soup.find('img').get('title') 
Out[4]: '3.4 out of 5' 
In [5]: soup.find('img').get('alt') 
Out[5]: '3.4 out of 5' 
+0

: 태그의 속성 값을 얻으려면 get()을 사용할 수 있습니다. 제가 게시한 HTML은 제가 필요로 하는 부분일 뿐이므로, 특정 이미지의 제목을 선택하려면 더 구체적으로 지정해야 합니다. 좋은 생각이 있을까요??? – user3488659

관련 문제