gensim 라이브러리에는 Matrix Market 형식의 파일을 파이썬 객체로 변환하는 클래스가 있습니다. 때로는 행렬을 전치(transpose)해야 하므로 MmReader에 transposed 매개변수가 도입되었습니다.
gensim - Python의 Matrix Market 형식에서의 전치(transposed) 매개변수
그러나 https://github.com/piskvorky/gensim/blob/develop/gensim/matutils.py 의 525-526행과 567-568행에서, transposed == False일 때 용어-문서 값과 ID의 교환이 일어나는 이유를 이해하지 못해 혼란스럽습니다.
정보 검색의 용어-문서 행렬에 익숙한 분이 계시다면 설명해 주실 수 있을까요? mmreader와 mmwriter의 형식이 동일한 경우
class MmReader(object):
    """
    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Note that the file is read into memory one document at a time, not the whole
    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
    which are larger than the available RAM.

    Meaning of the `transposed` flag: every data line of the file holds three
    fields, `row column value`. With `transposed=True` (the default) the row
    index is used as the document id and the column index as the term id. With
    `transposed=False` the two roles are swapped on the fly -- the column index
    becomes the document id and the row index the term id -- and the two
    dimensions read from the size line are swapped accordingly.
    """

    # Type(s) that mark `input` as a filesystem path rather than a file-like
    # object. Resolved once here so the class works on Python 2 (`basestring`)
    # as well as Python 3 (`str`).
    try:
        _string_types = basestring  # noqa: F821 -- Python 2 only
    except NameError:
        _string_types = str

    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns), unless
        `transposed` is False, in which case documents are the columns.

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).

        Only the header and the size line are read here; the attributes
        `num_docs`, `num_terms` and `num_nnz` are filled in from the size line.

        Raises ValueError if the first line is not a
        `%%MatrixMarket matrix coordinate real general` header.
        """
        logger.info("initializing corpus reader from %s" % input)
        self.input, self.transposed = input, transposed

        # Only a handle that is opened here is closed here; a file-like object
        # handed in by the caller stays open, because it is reused later by
        # `__iter__` and `docbyoffset`.
        opened_here = isinstance(input, self._string_types)
        if opened_here:
            input = open(input)
        try:
            header = next(input).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                 (self.input, header))
            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in input:
                if not line.startswith('%'):
                    # first non-comment line: "<rows> <columns> <non-zeros>"
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        # documents are stored as columns => swap the two dimensions
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break
        finally:
            if opened_here:
                input.close()

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                    (self.num_docs, self.num_terms, self.num_nnz))

    def __len__(self):
        """Return the number of documents, as read from the size line."""
        return self.num_docs

    def __str__(self):
        """Return a one-line summary of the corpus dimensions."""
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """
        Skip file headers that appear before the first document.

        Consumes all leading lines starting with '%' and also the first line
        that does not (the size line), so that the next line read from
        `input_file` is the first matrix entry.
        """
        for line in input_file:
            if line.startswith('%'):
                continue
            break

    def __iter__(self):
        """
        Iteratively yield vectors from the underlying file, in the format (row_no, vector),
        where vector is a list of (col_no, value) 2-tuples.

        Documents that have no entries in the file are yielded as empty lists,
        both between two stored documents and after the last stored one up to
        `self.num_docs`, so that document numbering stays contiguous.

        Entries must be grouped by document id in ascending order; an
        AssertionError is raised otherwise.
        """
        opened_here = isinstance(self.input, self._string_types)
        if opened_here:
            fin = open(self.input)
        else:
            fin = self.input
            fin.seek(0)  # caller-supplied stream: rewind so iteration can be repeated

        try:
            self.skip_headers(fin)

            previd = -1
            for line in fin:
                docid, termid, val = line.split()
                if not self.transposed:
                    termid, docid = docid, termid
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is previd)
                    if previd >= 0:
                        yield previd, document

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    empty_id = previd + 1
                    while empty_id < docid:
                        yield empty_id, []
                        empty_id += 1

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

            # handle the last document, as a special case
            if previd >= 0:
                yield previd, document

            # return empty documents between the last explicit document and the number
            # of documents as specified in the header
            empty_id = previd + 1
            while empty_id < self.num_docs:
                yield empty_id, []
                empty_id += 1
        finally:
            # also runs when the generator is closed or discarded before exhaustion
            if opened_here:
                fin.close()

    def docbyoffset(self, offset):
        """
        Return document at file offset `offset` (in bytes).

        The document is returned as a list of (term_id, value) 2-tuples, read
        from `offset` until the document id changes or the file ends.
        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []

        opened_here = isinstance(self.input, self._string_types)
        if opened_here:
            fin = open(self.input)
        else:
            fin = self.input

        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                docid, termid, val = line.split()
                if not self.transposed:
                    termid, docid = docid, termid
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        # reached the first entry of the next document => done
                        return document
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
            return document
        finally:
            if opened_here:
                fin.close()
#endclass MmReader