2013-09-24 4 views
0

gensim 라이브러리에는 Matrix Market 형식 파일을 파이썬 객체로 변환하는 클래스가 있습니다. 때로는 행렬을 전치(transpose)해야 하므로 MmReader에 transposed 매개변수가 도입되었습니다. (제목: gensim - Python에서 Matrix Market 형식의 transposed 매개변수)

그러나 https://github.com/piskvorky/gensim/blob/develop/gensim/matutils.py 의 525-526행과 567-568행에서, 용어-문서 값과 ID를 서로 바꾸는 처리가 왜 transposed == False일 때 일어나는지 혼란스럽습니다.

지식 검색에서 용어-문서 행렬에 익숙한 분이 계시다면 설명해 주실 수 있을까요? MmReader와 MmWriter의 형식은 동일해야 하지 않나요?

try:
    _STRING_TYPES = basestring  # Python 2: covers both `str` and `unicode` paths
except NameError:
    _STRING_TYPES = str  # Python 3: `basestring` no longer exists


class MmReader(object):
    """
    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Note that the file is read into memory one document at a time, not the whole
    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
    which are larger than the available RAM.

    With `transposed=True` (the default), the first column of every entry line
    is read as the document id and the second as the term id, and the size line
    is read as `num_docs num_terms num_nnz`. With `transposed=False` the two
    ids (and the two dimensions) are swapped after reading.
    """
    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).

        Raises `ValueError` if the first line is missing or is not a
        `%%MatrixMarket matrix coordinate real general` header.
        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        self.num_docs = self.num_terms = self.num_nnz = 0
        fin, owned = self._open()
        try:
            # an empty file yields '' here and is rejected by the header check
            # below, instead of leaking StopIteration out of the constructor
            header = next(fin, '').strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                 (self.input, header))
            for line in fin:
                if line.startswith('%'):
                    continue  # comment line
                # first non-comment line holds the matrix dimensions
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:  ## line 525
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break
        finally:
            if owned:
                fin.close()
        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries",
                    self.num_docs, self.num_terms, self.num_nnz)

    def __len__(self):
        """Return the number of documents, as read from the size line."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def _open(self):
        """
        Return `(file_object, owned)` for `self.input`.

        `owned` is True when `self.input` was a path and the file was opened
        here, in which case the caller is responsible for closing it. A
        file-like `self.input` is returned untouched with `owned` False.
        """
        if isinstance(self.input, _STRING_TYPES):
            return open(self.input), True
        return self.input, False

    def _parse_entry(self, line):
        """
        Parse one `<id> <id> <value>` entry line into `(docid, termid, value)`.

        The ids are swapped when `self.transposed` is False and converted from
        the 1-based indexes of the file to 0-based ones.
        """
        docid, termid, val = line.split()
        if not self.transposed:  ## line 567
            termid, docid = docid, termid
        # -1 because matrix market indexes are 1-based => convert to 0-based
        return int(docid) - 1, int(termid) - 1, float(val)

    def skip_headers(self, input_file):
        """
        Skip file headers that appear before the first document.

        Consumes every leading line that starts with '%' plus the first line
        that does not (the size line), leaving `input_file` positioned at the
        first entry.
        """
        for line in input_file:
            if line.startswith('%'):
                continue
            break

    def __iter__(self):
        """
        Iteratively yield vectors from the underlying file, in the format (row_no, vector),
        where vector is a list of (col_no, value) 2-tuples.

        Note that the total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.
        """
        fin, owned = self._open()
        try:
            if not owned:
                fin.seek(0)  # a shared file-like object may be at any position
            self.skip_headers(fin)

            previd = -1
            document = []
            for line in fin:
                docid, termid, val = self._parse_entry(line)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is previd)
                    if previd >= 0:
                        yield previd, document

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    previd += 1
                    while previd < docid:
                        yield previd, []
                        previd += 1

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val))  # add another field to the current document

            # handle the last document, as a special case
            if previd >= 0:
                yield previd, document

            # return empty documents between the last explicit document and the number
            # of documents as specified in the header
            previd += 1
            while previd < self.num_docs:
                yield previd, []
                previd += 1
        finally:
            # runs on exhaustion and also when the generator is closed early
            if owned:
                fin.close()

    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        fin, owned = self._open()
        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                docid, termid, val = self._parse_entry(line)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the first entry of the next document: done
                        return document
                    previd = docid

                document.append((termid, val))  # add another field to the current document
            return document
        finally:
            if owned:
                fin.close()
#endclass MmReader

답변