NodeJS 크롤러가 사이트에 로그인

geocaching.com을 크롤링하고 싶지만, 코드와 같은 일부 데이터는 로그인한 사용자만 볼 수 있습니다. npm의 "crawler" 패키지를 사용하고 있는데, 이 크롤러로 어떻게 로그인해야 하는지 모르겠습니다. 다만 로그인 폼의 필드 이름은 이미 알고 있습니다:

ctl00$ContentBody$tbUsername: 사용자 이름
ctl00$ContentBody$tbPassword: 비밀번호

지금까지 내 코드입니다 :

ctl00$ContentBody$btnSignIn: "Sign+In"

var Crawler = require("crawler"); 
var url = require('url'); 
var mongoose = require("mongoose"); 
var Cache = require("./models/cache.js"); 

// Connect mongoose using the connection string below (host localhost, port 27017, path "Cache").
// NOTE(review): the string has no "mongodb://" scheme — confirm the installed mongoose version accepts it.
mongoose.connect("localhost:27017/Cache"); 

/**
 * Strip anything that looks like an HTML/XML tag from a value.
 *
 * The argument is first converted with String(), so non-string input
 * (null, numbers, objects) is stringified rather than rejected.
 *
 * @param {*} text - value to clean.
 * @returns {string} the stringified value with every `<...>` run removed.
 */
var removeTags = function (text) {
    // Matches "<", one or more non-">" characters, then ">".
    var markup = /(<([^>]+)>)/ig;
    var raw = String(text);
    return raw.replace(markup, '');
};
// Crawler instance: looks up each geocache page's code in the Cache collection
// and queues every http(s) link found on HTML pages.
var c = new Crawler({
    maxConnections: 10,
    skipDuplicates: true,

    /**
     * Invoked once per fetched page.
     *
     * @param {Error|null} error - set when the request failed.
     * @param {Object} result - response object; `request.uri.href` and `headers` are read.
     * @param {Function} $ - jQuery-style query function bound to the fetched page.
     */
    callback: function (error, result, $) {
        // A failed request has no usable response or DOM, so there is nothing to parse.
        if (error || !result || !$) {
            if (error) {
                console.error("Request failed:", error);
            }
            return;
        }

        var pageUrl = result.request.uri.href;

        if (pageUrl.startsWith("http://www.geocaching.com/geocache/")) {
            // Read the element's text explicitly instead of stringifying the
            // selection object, whose string form depends on the DOM backend.
            var id = $(".CoordInfoCode").text().trim();
            Cache.count({
                "_id": id
            }, function (err, count) {
                if (err) {
                    console.error("Cache lookup failed for " + id + ":", err);
                    return;
                }
                if (count < 1) {
                    //Saving the data
                }
            });
        }

        if (result.headers['content-type'] === "text/html; charset=utf-8") {
            $('a').each(function (index, a) {
                var href = $(a).attr('href');
                // Anchors without an href have nothing to follow.
                if (!href) {
                    return;
                }

                // Resolve relative links against the page they were found on.
                var toQueueUrl = url.resolve(pageUrl, href);

                // Skip mailto:, javascript: and other non-web links.
                if (!/^https?:/i.test(toQueueUrl)) {
                    return;
                }

                // Queue after the current callback has returned.
                process.nextTick(function () {
                    c.queue(toQueueUrl);
                });
            });
        }
    }
});

c.queue('http://www.geocaching.com/seek/nearest.aspx?ul=Die_3sten_3');

출처

2014-11-16 Jhon Smith

-3

github에서 예제 자바 스크립트 크롤러를 만들었습니다.

이벤트 구동 방식이며 모든 리소스 (예 : URL)를 저장하기 위해 메모리 대기열을 사용합니다.

아래에서 자바스크립트 크롤러의 두 가지 핵심 메서드를 보여 드리겠습니다. 먼저 Node 환경에서는 다음과 같이 사용합니다:

// Load the Crawler constructor from the project's lib directory.
var Crawler = require('../lib/crawler') 
// Create a crawler for the given URL (presumably the crawl's starting point — confirm in lib/crawler).
var crawler = new Crawler('http://www.someUrl.com'); 

// Optional settings, left disabled in this example; their defaults are not shown here.
// crawler.maxDepth = 4; 
// crawler.crawlInterval = 10; 
// crawler.maxListenerCurrency = 10; 
// crawler.redisQueue = true; 
// Start the crawler.
crawler.start();

위와 같이 사용할 수 있습니다. 다음은 크롤러의 핵심 메서드인 run과 crawl입니다.

/**
 * Start the crawl loop.
 *
 * Marks the crawler as running and emits 'start' synchronously. On the next
 * tick it installs a repeating timer that calls crawl() every `crawlInterval`
 * and then calls crawl() once right away. The timer id is stored in
 * `crawlerIntervalId`.
 */
Crawler.prototype.run = function() {
    const self = this;
    const step = () => {
        self.crawl();
    };

    process.nextTick(() => {
        // Install the repeating timer first, then kick off the first crawl.
        self.crawlerIntervalId = setInterval(step, self.crawlInterval);
        step();
    });

    self.running = true;
    self.emit('start');
};


/**
 * Perform one crawl step.
 *
 * Does nothing when `_openRequests` has reached `maxListenerCurrency`.
 * Otherwise asks the queue for its oldest unfetched item and fetches it.
 * When there is no such item and no request is open, it compares the queue's
 * completed count with its length; if they match it emits "complete", clears
 * the interval timer and sets `running` to false.
 *
 * Errors reported by queue.complete / queue.getLength are thrown.
 */
Crawler.prototype.crawl = function() {
    const self = this;

    if (self._openRequests >= self.maxListenerCurrency) {
        return;
    }

    // Stops the crawler once every queued item has been completed.
    const stopIfFinished = () => {
        self.queue.complete((completeErr, completeCount) => {
            if (completeErr) {
                throw completeErr;
            }
            self.queue.getLength((lengthErr, length) => {
                if (lengthErr) {
                    throw lengthErr;
                }
                if (length !== completeCount) {
                    return;
                }
                // No open request and no unfetched item: the crawl is done.
                self.emit("complete", completeCount);
                clearInterval(self.crawlerIntervalId);
                self.running = false;
            });
        });
    };

    self.queue.oldestUnfetchedItem((err, queueItem, index) => {
        if (queueItem) {
            // Got an item: start fetching it.
            self.fetchQueueItem(queueItem, index);
            return;
        }
        if (self._openRequests === 0) {
            stopIfFinished();
        }
    });
};

GitHub 링크는 https://github.com/bfwg/node-tinycrawler 입니다. 1000줄의 코드로 작성된 자바스크립트 웹 크롤러입니다. 올바른 방향을 잡는 데 도움이 될 것입니다.

출처

2016-10-28 20:16:56

NodeJS 크롤러가 사이트에 로그인

답변

관련 문제