2011-04-28 5 views
0

먼저 비교하고 내가 할 노력하고있어,크롤링 페이지 텍스트

왔어요 :)을 지적 주시기 바랍니다 일을 더 이상 효율적인 방법이 있다면 무엇보다도 나는, 그래서 PHP 초보자 오전 사이트를 크롤링하고 찾은 페이지에서 응답 코드를 확인하는 데 사용 된 이전 PHP 스크립트에서 중복 된 콘텐츠 확인을 수행하도록 수정했습니다. similar_text 함수를 사용하여 1 페이지의 내용 (사용자가 지정한 내용)과 찾은 각 페이지의 내용을 비교합니다.

다소 느리지 만 작동합니다. 내가 가지고있는 유일한 문제는 처음 10 개의 링크가 끝나면 멈추고 그 이유를 알 수 없다는 것입니다. 나는 사전에 사과한다, 나는 꽤 약간의 코드라는 것을 안다. 어떤 도움이라도 대단히 감사합니다.

<!-- Crawl-configuration form: reference page, start URL, link budget, max depth.
     Posts back to this same script. -->
<!-- $_SERVER['PHP_SELF'] can contain attacker-controlled path info, so it must
     be escaped to prevent reflected XSS inside the action attribute. -->
<form action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8'); ?>" method="post">  
<div class="row"><label for="page1" class="small label"><strong>Page? </strong>: </label><input type="text" name="page1" id="page1" value="" size="40" /></div>   
<div class="row"><label for="url" class="small label"><strong>Please Enter URL </strong>: </label><input type="text" name="url" id="url" value="" size="40" /></div> 
<div class="row"><label for="maxlinks" class="small label"><strong>Number of links to get </strong>: </label><input type="text" name="maxlinks" id="maxlinks" value="25" size="3" maxlength="3" /></div> 
<div class="row"><label for="linkdepth" class="small label"><strong>Links Maximum depth</strong> : </label> <select name="linkdepth" id="linkdepth" ><option value="1">1</option> 
<option value="2" selected="selected">2</option> 
<option value="3">3</option> 
<option value="4">4</option> 
<option value="5">5</option> 
<option value="6">6</option> 
</select></div> 
<input type="submit" name="submit" style="font-weight: bold" value="Check links" id="submit" /> 
</form> 
<?php 
if (isset($_POST['submit'])){ 
    /* PHP's default max_execution_time (30 seconds) was killing the
       crawl after roughly the first ten links — the exact symptom the
       question describes. Let the script run until the crawl finishes. */
    set_time_limit(0);

    $page1 = ($_POST['page1']);              // reference page for the similar_text comparison
    $baseurl = ($_POST['url']);              // start URL of the crawl
    $pages = array();                        // queue/registry: url => NULL (pending) | info array (crawled)
    $i=($_POST['linkdepth']);                // maximum link depth (approximated by slash count)
    $maxlinks = (integer)$_POST['maxlinks']; // crawl budget: stop queueing past this many URLs

$domain= extract_domain_name($baseurl); 
echo '<p class="small">Extracted domain name: <strong>'.$domain.'</strong>. '; 
echo 'Maximum depth: <strong>'.$i.'</strong></p>'; 
function get_urls($page){ 
    global $domain, $i; 

    /* Download $page and return every same-(sub)domain link found in it
       whose depth does not exceed the configured limit $i.
       Returns false when the download fails, so callers can tell
       "no page" apart from "no links". */
    $ch = curl_init(); 
    curl_setopt($ch, CURLOPT_URL, $page); 
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 
    curl_setopt($ch, CURLOPT_HEADER, true); 
    /* Spoof the User-Agent header value; just to be safe */ 
    curl_setopt($ch, CURLOPT_USERAGENT, 
     'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'); 
    /* Connection/download timeouts so the script does not get stuck on
       huge files or nonresponsive servers. */ 
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100); 
    curl_setopt($ch, CURLOPT_TIMEOUT, 100); 
    curl_setopt($ch, CURLOPT_FAILONERROR, 0); 

    /* Download the page ONCE. The original called curl_exec() a second
       time just to test for errors, which re-downloaded every page and
       roughly doubled the crawl time. */ 
    $html = curl_exec($ch); 
    if($html === false) 
     { 
     echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">' . curl_error($ch).'</p></strong>'; 
     } 

    curl_close($ch); 

    if(!$html) return false; 

    /* Extract the BASE tag (if present) for relative-to-absolute URL
       conversions later. */ 
     if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i',$html, $matches)){ 
        $base_url=$matches[1]; 
        echo $base_url; 
      } else { 
        $base_url=$page; // page the current round of checking started from 
      } 

      $links=array(); 
      $html = str_replace("\n", ' ', $html); 

      /* this regexp is a combination of numerous versions seen online */ 
      preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m); 
       foreach($m[1] as $url) { 
       $url=trim($url); 
       /* strip PHPSESSID, #fragments, HTML-escaped ampersands and
          javascript: pseudo-links. The third pattern was mangled to the
          no-op /&/ -> & by HTML entity decoding; it must match &amp;. */ 
       $url=preg_replace( 
        array('/([\?&]PHPSESSID=\w+)$/i','/(#[^\/]*)$/i', '/&amp;/','/^(javascript:.*)/i'), 
        array('','','&',''), 
        $url); 

       /* turn relative URLs into absolute URLs; relative2absolute()
          is defined further down on this page. */ 
        $url = relative2absolute($base_url, $url); 

        // keep only links within the same (sub-)$domain 
       if(preg_match("/^http[s]?:\/\/[^\/]*".str_replace('.', '\.', $domain)."/i", $url)) 
       { 
       /* slash count approximates link depth */ 
       $depth= substr_count($url, "/")-2 ; 

     if ($depth <= $i){ 
      /* The third argument must be the boolean true (strict compare);
         the bare word `check` was an undefined constant and is a fatal
         error on PHP 8. */ 
      if(!in_array($url, $links, true)) $links[]=$url; 
       } } 
     } 

    return $links; 

} 

// Functions to crawl the next page 
// Return the next uncrawled URL from the global $pages queue — the first
// entry whose value is still NULL — echoing its 1-based position as a
// progress marker. Returns NULL when every queued URL has been crawled.
function next_page(){ 
    global $pages; 
    $position = 0; 
    foreach($pages as $url => $state){ 
        $position++; 
        if($state == NULL){ 
            echo "[$position] - "; 
            return $url; 
        } 
    } 
    return NULL; 
} 

function add_urls($page){ // crawls $page, reports its similarity to the reference page, queues new unique urls 
    global $pages, $maxlinks; 

    /* microtime() with no argument returns a "msec sec" STRING, so the
       original subtraction produced meaningless values; microtime(true)
       returns a float suitable for arithmetic. */ 
    $start = microtime(true); 
    $urls = get_urls($page);               // download + extract links (false on failure) 
    $resptime = microtime(true) - $start;  // seconds spent fetching/parsing; shows where the crawler stalls 

    /* HEAD request so the HTTP status code for $page is available in $info */ 
    $ch = curl_init($page); 
    curl_setopt($ch, CURLOPT_NOBODY, 1); 
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 
    curl_exec($ch); 
    $info = curl_getinfo($ch); 
    curl_close($ch); // Close handle 

    print "$page"; 

    /* Compare the user-supplied reference page with the crawled page.
       NOTE(review): file_get_contents($page) downloads the page a second
       time after get_urls() already fetched it; reusing that body would
       roughly halve the crawl traffic. */ 
    $page1 = ($_POST['page1']); 
    $page1data = file_get_contents($page1); 
    $page2 = file_get_contents($page); 

    similar_text($page1data, $page2, $p); // $p receives the match percentage 
    $p = round($p, 2); 

    echo ' - Match Percentage:' . $p . '%'; 
    /* echo substr(($resptime),0,5). " seconds"; */ // Activate this to see how long each page takes to crawl 
    echo '<br/>'; 

    /* mark this page as crawled */ 
    $pages[$page] = array ('resptime' => floor($resptime * 9000), 'url' => $page); 

    /* get_urls() returns false on download failure; the original passed
       that straight into foreach, raising a warning. The old value-side
       in_array($url, $pages) check was a no-op (values are NULL or info
       arrays, never URL strings) and has been dropped. */ 
    if (is_array($urls)) { 
        foreach($urls as $url){ 
            if(!array_key_exists($url, $pages) && count($pages)<$maxlinks){ 
                $pages[$url] = NULL; 
            } 
        } 
    } 
} 

echo '[1] - '; // this is for the first input url, as it will be extracted from input 
// Seed the queue with (and crawl) the start URL entered in the form.
add_urls($baseurl); 

// Main crawl loop: next_page() hands back the first queued URL whose
// state is still NULL; add_urls() crawls it and queues any new links.
while(($page= next_page()) != NULL) //while there are urls available 


{ 
add_urls($page); 

} 

    echo '<p class="small">Amount of crawled links: <strong>'.count ($pages).'</strong></p>'; 
    // Fewer crawled pages than the budget means the site ran out of links first.
    if (count($pages)<$maxlinks) echo '<p class="small">Sorry, no more links to crawl!!</p>';// count all extracted Urls 
} 

?><?php 
function extract_domain_name($url){ 
    /* Return the registrable domain — the last two dot-separated host
       segments — of $url, e.g. "http://www.example.com/x" -> "example.com".
       Returns '' when no host/domain can be extracted.
       The original pattern only recognised "http://", so an https base
       URL yielded the bogus host "https:" and broke the domain filter. */ 
    if (!preg_match("/^(https?:\/\/)?([^\/]+)/i", $url, $matches)) { 
        return ''; 
    } 
    $host = $matches[2]; 
    // keep only the last two segments of the host name 
    if (preg_match("/[^\.\/]+\.[^\.\/]+$/", $host, $matches)) { 
        return $matches[0]; 
    } 
    return ''; 
} 

function relative2absolute($absolute, $relative) { 
    /* Resolve $relative against the base URL $absolute.
       A $relative that already carries a scheme is returned unchanged.
       ".." segments are collapsed against the preceding path segment. */ 
    $p = parse_url($relative); 
    if(isset($p["scheme"]) && $p["scheme"]) return $relative; 

    /* Pull the base URL apart with explicit defaults. The original used
       extract(parse_url(...)), which leaves $scheme/$host/$user/$pass/$path
       undefined (notices, and dirname(null) errors) for partial URLs. */ 
    $parts  = parse_url($absolute); 
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : ''; 
    $host   = isset($parts['host'])   ? $parts['host']   : ''; 
    $user   = isset($parts['user'])   ? $parts['user']   : ''; 
    $pass   = isset($parts['pass'])   ? $parts['pass']   : ''; 
    $path   = isset($parts['path'])   ? $parts['path']   : '/'; 
    $path   = dirname($path); 

    /* Curly-brace string offsets ($relative{0}) were removed in PHP 8;
       use bracket syntax, and guard against an empty $relative. */ 
    if($relative !== '' && $relative[0] == '/') 
    { 
        // root-relative link: ignore the base path entirely 
        $newPath = array_filter(explode("/", $relative)); 
    } 
    else 
    { 
        // merge base-directory segments with the relative segments, 
        // collapsing each ".." against the previous segment 
        $aparts = array_filter(explode("/", $path)); 
        $rparts = array_filter(explode("/", $relative)); 
        $cparts = array_merge($aparts, $rparts); 
        $k = 0; 
        $newPath = array(); 
        foreach($cparts as $i => $part) 
        { 
            if($part == '..') 
            { 
                $k = $k - 1; 
                $newPath[$k] = null; // erase the segment ".." cancels out 
            } 
            else 
            { 
                $newPath[$k] = $cparts[$i]; 
                $k = $k + 1; 
            } 
        } 
        $newPath = array_filter($newPath); 
    } 
    $path = implode("/", $newPath); 

    /* reassemble scheme://user:pass@host/path */ 
    $url = ""; 
    if($scheme) 
    { 
        $url = "$scheme://"; 
    } 
    if($user) 
    { 
        $url .= "$user"; 
        if($pass) 
        { 
            $url .= ":$pass"; 
        } 
        $url .= "@"; 
    } 
    if($host) 
    { 
        $url .= "$host/"; 
    } 
    $url .= $path; 
    return $url; 
} 

################################################## 

답변

1

약 30 초가 지나면 중지됩니다. 스크립트 맨 위에 다음을 추가하십시오. set_time_limit(0);

일반적으로 PHP 스크립트는 30 초 후에 종료되지만,이를 무시할 수 있습니다.

+0

필자는 각 PHP 구문을 아래에 추가하려고 시도했는데, 스크립트는 여전히 10개에서 멈춥니다. – Batfan