함수에 전달할 데이터의 형식을 질문에서 언급하지 않으셨습니다. 그래서 몇 가지 예시 데이터를 직접 만들어 코드를 작성했습니다.
아래 프로그램은 다음 링크에 나온 위치 색인(positional index) 형식의 데이터를 입력으로 가정합니다: http://nlp.stanford.edu/IR-book/html/htmledition/positional-indexes-1.html
#Document Frequency
# Sample positional postings for the term "to".  Each "info" entry lists one
# document and the token positions at which the term occurs in it; entries are
# ordered by ascending document_id and positions are ascending.
document_frequecy_data_1 = {
    "word": "to",
    "frequency": 993427,
    "info": [
        {"document_id": 1, "occurrence_count": 6, "positions": [7, 18, 33, 72, 86, 231]},
        {"document_id": 2, "occurrence_count": 5, "positions": [1, 17, 74, 222, 255]},
        {"document_id": 4, "occurrence_count": 5, "positions": [8, 16, 190, 429, 433]},
        {"document_id": 5, "occurrence_count": 2, "positions": [363, 367]},
        {"document_id": 7, "occurrence_count": 3, "positions": [13, 23, 191]},
    ],
}
# Sample positional postings for the term "be", in the same layout: one "info"
# entry per document, ordered by ascending document_id, with ascending token
# positions inside each entry.
document_frequecy_data_2 = {
    "word": "be",
    "frequency": 178239,
    "info": [
        {"document_id": 1, "occurrence_count": 2, "positions": [17, 25]},
        {"document_id": 4, "occurrence_count": 5, "positions": [17, 191, 291, 430, 434]},
        {"document_id": 5, "occurrence_count": 3, "positions": [14, 19, 101]},
    ],
}
def pos_intersect(data_1, data_2, k=1):
    """Find where two terms occur within ``k`` positions of each other.

    Each argument is a postings record whose ``"info"`` list holds one dict
    per document with a ``"document_id"`` and a ``"positions"`` list.  The
    ``"info"`` lists must be ordered by ascending ``document_id`` (the merge
    below relies on it), and the ``"positions"`` of ``data_2`` must be
    ascending (the early exit from the position scan relies on it).

    Args:
        data_1: Postings record for the first term.
        data_2: Postings record for the second term.
        k: Maximum allowed absolute distance between the two positions.

    Returns:
        A list of dicts with keys ``"document_id"``, ``"position_data_1"``
        and ``"position_data_2"``, one per pair of positions that share a
        document and satisfy ``abs(position_1 - position_2) <= k``.  Pairs
        are ordered by document, then by position in ``data_1``, then by
        position in ``data_2``.  Each qualifying pair appears exactly once.
    """
    answer = []
    postings_1 = data_1["info"]
    postings_2 = data_2["info"]
    i = 0
    j = 0
    # Classic sorted-list merge: advance whichever side has the smaller
    # document id until both point at the same document.
    while i < len(postings_1) and j < len(postings_2):
        doc_id_1 = postings_1[i]["document_id"]
        doc_id_2 = postings_2[j]["document_id"]
        if doc_id_1 < doc_id_2:
            i += 1
            continue
        if doc_id_2 < doc_id_1:
            j += 1
            continue

        positions_2 = postings_2[j]["positions"]
        for position_1 in postings_1[i]["positions"]:
            # ``k`` is only ever read here, so the caller's proximity window
            # is what actually bounds the match.
            for position_2 in positions_2:
                if position_2 > position_1 + k:
                    # positions_2 is ascending: nothing further can match.
                    break
                if position_2 < position_1 - k:
                    continue
                answer.append({
                    "document_id": doc_id_1,
                    "position_data_1": position_1,
                    "position_data_2": position_2,
                })
        i += 1
        j += 1
    return answer
# Demo run: list every place where the two sample terms occur within 4
# positions of each other in the same document.
results = pos_intersect(document_frequecy_data_1, document_frequecy_data_2, 4)
print("Results : ")
for res in results:
    # One pre-formatted string prints the same text under Python 2 and 3.
    # "Position 2" reports position_data_2 (the second term's position).
    print("Document id : %s  Position 1:  %s  Position 2 : %s" % (
        res["document_id"], res["position_data_1"], res["position_data_2"]))
그럴 수도 있겠지만, 제 질문은 링크에 나온 형식으로 데이터가 주어졌을 때 교집합을 구하는 방법에 관한 것입니다. 예: >>> phrase_intersect([[0, 1, 4], [1, 0], [10, 2, 3, 4]], [[0, 2, 6], [1, 2], [2, …]]) 출력 -> [[0, 2], [10, 5]] >>> phrase_intersect([[1, 2]], [[1, 4]]) 출력 -> []