2015-01-29 2 views
-3

PHP를 사용하여 텍스트 파일에서 키워드를 빈도 수와 함께 추출해야 합니다. 키워드만 출력하는 코드를 찾았습니다(예: some, text, machines, vending). 이 키워드들과 함께 빈도 수도 필요합니다(예: some 3, text 2, machines 1, vending 1). 필요한 수정 사항을 제안해 주실 수 있습니까?

PHP로 키워드 추출

/**
 * Extract the most frequent keywords from a piece of English text.
 *
 * The text is lower-cased, stripped of everything except ASCII letters,
 * digits, spaces and dashes, and split into alphanumeric words. Stop words
 * and words of three characters or fewer are ignored. The ten most frequent
 * remaining words are returned, most frequent first.
 *
 * @param string $string Raw input text.
 *
 * @return array Map of keyword => array('count' => int, 'bytotal' => float),
 *               where 'bytotal' is the keyword's count divided by the total
 *               number of words found in the text (before filtering).
 *               Empty array when the text contains no words.
 */
function extractCommonWords($string)
{
    $stopWords = array('i','a','about','an','and','are','as','at','be','by','com','de','en','for','from','how','in','is','it','la','of','on','or','that','the','this','to','was','what','when','where','who','will','with','und','www');

    // Collapse every whitespace run (tabs, newlines, repeated spaces) into a
    // single space so that neighbouring words are never glued together.
    $string = preg_replace('/\s+/', ' ', (string) $string);
    $string = trim($string);
    $string = preg_replace('/[^a-zA-Z0-9 -]/', '', $string); // only take alphanumerical characters, but keep the spaces and dashes too
    $string = strtolower($string); // make it lowercase

    // Match the words themselves rather than everything between word
    // boundaries; this keeps separators (spaces, dashes) out of the result.
    preg_match_all('/[a-z0-9]+/', $string, $matches);
    $allWords = $matches[0];
    $totalWords = count($allWords);

    $wordCountArr = array();
    if ($totalWords === 0) {
        // Nothing to count, and avoids a division by zero below.
        return $wordCountArr;
    }

    foreach ($allWords as $word) {
        if (strlen($word) <= 3 || in_array($word, $stopWords, true)) {
            continue;
        }
        if (!isset($wordCountArr[$word])) {
            $wordCountArr[$word] = array('count' => 0);
        }
        $wordCountArr[$word]['count']++;
    }

    // Highest count first; keys (the words) are preserved.
    arsort($wordCountArr);

    // preserve_keys = true: purely numeric words such as "2015" become integer
    // keys, which array_slice() would otherwise renumber to 0, 1, 2, ...
    $wordCountArr = array_slice($wordCountArr, 0, 10, true);

    // Write the ratio back into the array itself (a plain by-value foreach
    // variable would be discarded).
    foreach ($wordCountArr as $word => $stats) {
        $wordCountArr[$word]['bytotal'] = $stats['count'] / $totalWords;
    }

    return $wordCountArr;
}
// Demo: print the extracted keywords as a comma-separated list.
$sampleText = "This is some text. This is some text. Vending Machines are great.";
$keywordStats = extractCommonWords($sampleText);
$keywordList = array_keys($keywordStats);
echo implode(',', $keywordList);

답변

0
/**
 * Count how often each keyword occurs in a piece of English text.
 *
 * The text is lower-cased, stripped of everything except ASCII letters,
 * digits, spaces and dashes, and split into alphanumeric words. Stop words
 * and words of three characters or fewer are ignored.
 *
 * @param string $string Raw input text.
 *
 * @return array Map of keyword => number of occurrences, ordered from the
 *               most to the least frequent. Empty array when no keyword
 *               is found.
 */
function extractCommonWords($string)
{
    $stopWords = array('i','a','about','an','and','are','as','at','be','by','com','de','en','for','from','how','in','is','it','la','of','on','or','that','the','this','to','was','what','when','where','who','will','with','und','www');

    // Collapse every whitespace run (tabs, newlines, repeated spaces) into a
    // single space so that neighbouring words are never glued together.
    $string = preg_replace('/\s+/', ' ', (string) $string);
    $string = trim($string);
    $string = preg_replace('/[^a-zA-Z0-9 -]/', '', $string); // only take alphanumerical characters, but keep the spaces and dashes too
    $string = strtolower($string); // make it lowercase

    // Match the words themselves rather than everything between word
    // boundaries; this keeps separators (spaces, dashes) out of the result.
    preg_match_all('/[a-z0-9]+/', $string, $matches);

    $wordCountArr = array();
    foreach ($matches[0] as $word) {
        if (strlen($word) <= 3 || in_array($word, $stopWords, true)) {
            continue;
        }
        if (isset($wordCountArr[$word])) {
            $wordCountArr[$word] += 1;
        } else {
            $wordCountArr[$word] = 1;
        }
    }

    // Highest count first; keys (the words) are preserved.
    arsort($wordCountArr);

    // Hand the counts back to the caller, which iterates over them.
    return $wordCountArr;
}

// Demo: report every keyword together with how many times it occurred.
$sampleText = "This is some text. This is some text. Vending Machines are great.";
$frequencies = extractCommonWords($sampleText);
foreach ($frequencies as $keyword => $occurrences) {
    echo $keyword, " was found ", $occurrences, " time(s)<br> ";
}
관련 문제