2011-03-04 5 views
0

일부 결과를 강조하려고합니다. 나는 "내용"필드에서 내 문서의 본문 (텍스트)을 색인화하고 highlighter.getBestFragment (...)를 사용하여 최고로 고치려고하면 NullPointerException이 발생합니다.Lucene, 강조 표시 및 NullPointerException

그러나 예를 들어 fileName을 강조하려고하면 제대로 작동합니다. fileReader 또는 (ParsingReader)에서 하나의 필드 만 사용하므로 파일 이름과 다른 텍스트가 토큰 화됩니다.

여기 내 코드가 있습니다. 제발 도와주세요.

package xxxxxx; 

import java.io.File; 
import java.io.FileFilter; 
import java.io.FileReader; 
import java.io.IOException; 
import java.io.Reader; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.index.IndexWriter; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.FSDirectory; 
import org.apache.lucene.util.Version; 
import org.apache.tika.parser.ParsingReader; 

public class Indexer { 

    static long start = 0; 

    public static void main(String[] args) throws Exception { 
     System.out.println("l'index se trouve à " + args[0]); 
     System.out.println("le dossier ou s'effectue l'indexation est :" + args[1]); 
     if (args.length != 2) { 
      throw new IllegalArgumentException("Usage: java " + Indexer.class.getName() 
        + " <index dir> <data dir>"); 
     } 

     String indexDir = args[0]; 
     String dataDir = args[1]; 


     start = System.currentTimeMillis(); 
     Indexer indexer = new Indexer(indexDir); 
     int numIndexed; 
     try { 
      numIndexed = indexer.index(dataDir, new TextFilesFilter()); 


     } finally { 

      indexer.close(); 
     } 

     long end = System.currentTimeMillis(); 
     System.out.println("Indexing " + numIndexed + " files took " 
       + (end - start) + " milliseconds"); 
    } 
    private IndexWriter writer; 

    public Indexer(String indexDir) throws IOException, InterruptedException { 
     Directory dir = FSDirectory.open(new File(indexDir)); 

     writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30), true, 
       IndexWriter.MaxFieldLength.UNLIMITED); 
     writer.setUseCompoundFile(true); 
    } 

    public void close() throws IOException { 
     writer.optimize(); 
     writer.close(); 
    } 

    public int index(String dataDir, FileFilter filter) throws Exception { 

     File[] files = new File(dataDir).listFiles(); 

     for (File f : files) { 

      if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) { 

       if (!(f.getCanonicalPath().endsWith("~"))) { 
        indexFile(f); 
       } 
      } else { 
       index(f.toString(), filter); 
      } 
     } 
     return writer.numDocs(); 
    } 

    private static class TextFilesFilter implements FileFilter { 

     public boolean accept(File path) { 
      return true; 
     } 
    } 

    protected Document getDocument(File f) throws Exception { 
     // FileReader frf = new FileReader(f); 
     Document doc = new Document(); 
     Reader reader = new ParsingReader(f); 

     doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); 
     doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED)); 
     doc.add(new Field("fullpath", f.getCanonicalPath(),Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); 
     return doc; 
    } 

    private void indexFile(File f) throws Exception { 
     System.out.println("Indexing " + f.getCanonicalPath()); 
     Document doc = getDocument(f); 
     writer.addDocument(doc); 
     System.out.println(System.currentTimeMillis() - start); 
    } 
} 

------------------------------------------------------------------- 



    package xxxxxxxxxxxxxxxxxxxx; 

import java.io.File; 
import java.io.IOException; 
import java.util.ArrayList; 
import java.util.List; 
import org.apache.lucene.analysis.TokenStream; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.queryParser.MultiFieldQueryParser; 
import org.apache.lucene.queryParser.ParseException; 
import org.apache.lucene.queryParser.QueryParser; 
import org.apache.lucene.search.DisjunctionMaxQuery; 
import org.apache.lucene.search.IndexSearcher; 
import org.apache.lucene.search.Query; 
import org.apache.lucene.search.ScoreDoc; 
import org.apache.lucene.search.TopDocs; 
import org.apache.lucene.search.highlight.Highlighter; 
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; 
import org.apache.lucene.search.highlight.QueryScorer; 
import org.apache.lucene.search.highlight.SimpleSpanFragmenter; 
import org.apache.lucene.search.highlight.TokenSources; 
import org.apache.lucene.store.Directory; 
import org.apache.lucene.store.FSDirectory; 
import org.apache.lucene.util.Version; 

public class Searcher { 

    public static void main(String[] args) throws IllegalArgumentException, 
      IOException, ParseException, InvalidTokenOffsetsException { 
     System.out.println("endroit ou se situe l'index " + args[0]); 
     System.out.println(args[1]); 
     if (args.length != 2) { 
      throw new IllegalArgumentException("Usage: java " 
        + Searcher.class.getName() 
        + " <index dir> <query>"); 
     } 

     String indexDir = args[0]; 
     String q = args[1]; 
     search(indexDir, q); 
    } 


    public static void search(String indexDir, String q) throws IOException, ParseException, InvalidTokenOffsetsException { 
     Directory dir = FSDirectory.open(new File(indexDir)); 
     IndexSearcher indexSearcher = new IndexSearcher(dir); 
     QueryParser parserC = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30)); 
    //  QueryParser parserN = new QueryParser(Version.LUCENE_30, "filename", new StandardAnalyzer(Version.LUCENE_30)); 
     QueryParser parserP = new QueryParser(Version.LUCENE_30, "fullpath", new StandardAnalyzer(Version.LUCENE_30)); 
     parserC.setDefaultOperator(QueryParser.Operator.OR); 
    // parserN.setDefaultOperator(QueryParser.Operator.OR); 
     parserC.setPhraseSlop(10); 
     // parserN.setPhraseSlop(10); 
     DisjunctionMaxQuery dmq = new DisjunctionMaxQuery(6); 

     Query query = new MultiFieldQueryParser(Version.LUCENE_30, new String[]{"contents", "filename"}, 
       new CustomAnalyzer()).parse(q); 

     Query queryC = parserC.parse(q); 
     //Query queryN = parserN.parse(q); 
     dmq.add(queryC); 
     //dmq.add(queryN); 
     //  dmq.add(query)  ; 
     QueryScorer scorer = new QueryScorer(dmq, "contents"); 
     Highlighter highlighter = new Highlighter(scorer); 
     highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer)); 


     System.out.println(query.toString()); 
     long start = System.currentTimeMillis(); 
     TopDocs hits = indexSearcher.search(dmq, 15); 
     System.out.println(hits.totalHits); 
     long end = System.currentTimeMillis(); 
     System.err.println("Found " + hits.totalHits 
       + " document(s) (in " + (end - start) 
       + " milliseconds) that matched query '" 
       + q + "':"); 

     for (ScoreDoc scoreDoc : hits.scoreDocs) { 

      Document doc = indexSearcher.doc(scoreDoc.doc); 
      System.out.print(scoreDoc.score); 
      System.out.println(doc.get("fullpath")); 

String contents = doc.get("contents"); // I am pretty sure the mistake is here , contents is always Null 
//But what can I do to make this thing work ? 
      TokenStream stream = 
        TokenSources.getAnyTokenStream(indexSearcher.getIndexReader(), 
        scoreDoc.doc, 
        "contents", 
        doc, 
        new StandardAnalyzer(Version.LUCENE_30)); 
      String fragment = 
        highlighter.getBestFragment(stream, contents); 
      System.out.println(fragment); 
     } 
     indexSearcher.close(); 
    } 
} 

---------------------------------------------------------------------- 

답변

0

하이 라이터를 사용하려면 저장해야합니다. "filename"은 저장되지만 "내용"은 그렇지 않습니다. 따라서 다른 행동을하는 것을 볼 수 있습니다 :

doc.add(new Field("contents", reader, Field.TermVector.WITH_POSITIONS_OFFSETS)); 
    doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.ANALYZED));