2014-12-16 3 views
5

스탠포드의 프랑스어 POS 태거에서 반환되는 품사 태그를 어떻게 단순화할 수 있습니까? 영어의 경우에는 NLTK에서 map_tag()을 사용하면 문장의 각 단어의 품사를 찾아 태그 세트를 단순화하기가 매우 쉽습니다: NLTK를 사용하여 프랑스어 POS 태그 세트 단순화

#!/usr/bin/python 
# -*- coding: utf-8 -*- 

import os 
from nltk.tag.stanford import POSTagger 
from nltk.tokenize import word_tokenize 
from nltk.tag import map_tag 

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home 
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin" 

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands." 

path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger" 
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar" 

#define english and french taggers 
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8") 

#each tuple in list_of_english_pos_tuples = (word, pos) 
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english)) 

simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples] 

print simplified_pos_tags_english 

#output = [(u'the', u'DET'), (u'whole', u'ADJ'), (u'earth', u'NOUN'), (u'swarms', u'NOUN'), (u'with', u'ADP'), (u'living', u'NOUN'), (u'beings', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'plant', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'grain', u'NOUN'), (u'and', u'CONJ'), (u'leaf', u'NOUN'), (u',', u'.'), (u'supports', u'VERB'), (u'the', u'DET'), (u'life', u'NOUN'), (u'of', u'ADP'), (u'thousands', u'NOUN'), (u'.', u'.')] 

하지만 다음 코드에서 스탠포드 POS 태거의 프랑스어 모델이 반환하는 태그를 보편적 태그 세트에 매핑하는 방법은 잘 모르겠습니다: 스탠포드 POS 태거의 프랑스어 모델에서 사용하는 기본 태그 세트를 단순화하는 방법

#!/usr/bin/python 
# -*- coding: utf-8 -*- 

import os 
from nltk.tag.stanford import POSTagger 
from nltk.tokenize import word_tokenize 
from nltk.tag import map_tag 

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home 
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin" 

french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés." 

path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger" 
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar" 

french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8") 

list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french)) 

#up to this point all is well, but I'm not sure how to successfully create a simplified pos tagset with the French tuples 
simplified_pos_tags_french = [(word, map_tag('SOME_ARGUMENT', 'universal', tag)) for word, tag in list_of_french_pos_tuples] 
print simplified_pos_tags_french 

사람이 알고 있나요? 다른 사람들이이 질문에 제공 할 수있는 통찰력에 대해 감사하게 생각합니다.

답변

8

필자는 Stanford의 POS 태그를 유니버설 태그 세트에 수동으로 매핑하는 결과를 보았습니다. 가치가있는 부분에 대해서는 위의 스 니펫이 프랑스어와 영어 문장 사이의 구문 유사성을 측정하기위한 약간 더 큰 워크 플로의 일부였습니다. 다른 사람들을 돕기 위해 전체 코드는 다음과 같습니다.

#!/usr/bin/python 
# -*- coding: utf-8 -*- 

'''NLTK 3.0 offers map_tag, which maps the Penn Treebank Tag Set to the Universal Tagset, a coarse tag set with the following 12 tags:

VERB - verbs (all tenses and modes) 
NOUN - nouns (common and proper) 
PRON - pronouns 
ADJ - adjectives 
ADV - adverbs 
ADP - adpositions (prepositions and postpositions) 
CONJ - conjunctions 
DET - determiners 
NUM - cardinal numbers 
PRT - particles or other function words 
X - other: foreign words, typos, abbreviations 
. - punctuation 

We'll map Stanford's tag set to this tag set then compare the similarity between subregions of French and English sentences.''' 

from __future__ import division 
import os, math 
from nltk.tag.stanford import POSTagger 
from nltk.tokenize import word_tokenize 
from nltk.tag import map_tag 
from collections import Counter 

######################### 
# Create Tagset Mapping # 
######################### 

def create_french_to_universal_dict(): 
    '''Build and return the mapping from French Treebank (Crabbe & Candito)
    POS tags to the 12-tag Universal Tagset; the module-level
    french_to_universal_dict below is created from this once.'''
    # nb, I (interjection) is not part of the universal tagset, so it is
    # mapped to X, as are ET (foreign words).
    return {
        u"ADJ":    u"ADJ",
        u"ADJWH":  u"ADJ",
        u"ADV":    u"ADV",
        u"ADVWH":  u"ADV",
        u"CC":     u"CONJ",
        u"CLO":    u"PRON",
        u"CLR":    u"PRON",
        u"CLS":    u"PRON",
        u"CS":     u"CONJ",
        u"DET":    u"DET",
        u"DETWH":  u"DET",
        u"ET":     u"X",
        u"NC":     u"NOUN",
        u"NPP":    u"NOUN",
        u"P":      u"ADP",
        u"PUNC":   u".",
        u"PRO":    u"PRON",
        u"PROREL": u"PRON",
        u"PROWH":  u"PRON",
        u"V":      u"VERB",
        u"VIMP":   u"VERB",
        u"VINF":   u"VERB",
        u"VPP":    u"VERB",
        u"VPR":    u"VERB",
        u"VS":     u"VERB",
        u"I":      u"X",
    }

french_to_universal_dict = create_french_to_universal_dict() 

def map_french_tag_to_universal(list_of_french_tag_tuples, mapping=None): 
    '''Map a list of (word, french_pos) tuples to (word, universal_pos).

    mapping defaults to the module-level french_to_universal_dict; passing
    an explicit dict generalizes the helper (and makes it testable).
    Tags absent from the mapping fall back to u"X" ("other") instead of
    raising KeyError, in case the Stanford French model emits a tag not
    enumerated in the dict above.
    '''
    if mapping is None:
        mapping = french_to_universal_dict
    return [(word, mapping.get(tag, u"X")) for word, tag in list_of_french_tag_tuples] 

############################### 
# Define Similarity Functions # 
############################### 

def counter_cosine_similarity(c1, c2): 
    '''Return the cosine similarity of two Counters, treating each as a
    sparse vector over the union of their keys.

    Returns 0.0 when either counter is empty (the original raised
    ZeroDivisionError on an empty sentence).
    '''
    terms = set(c1).union(c2) 
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms) 
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms)) 
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms)) 
    if magA == 0 or magB == 0:
        # a zero vector has no direction; define its similarity as 0
        return 0.0
    return dotprod/(magA * magB) 

def longest_common_subsequence_length(a, b): 
    '''Return the length of the longest common (not necessarily contiguous)
    subsequence of sequences a and b, via the standard O(len(a)*len(b))
    dynamic-programming table.

    Uses range rather than the Python-2-only xrange (identical behavior on
    Python 2, and the function then also runs under Python 3).
    '''
    table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)] 
    for i, ca in enumerate(a, 1): 
        for j, cb in enumerate(b, 1): 
            if ca == cb:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i][j - 1], table[i - 1][j])
    return table[-1][-1] 

def longest_contiguous_subsequence_length(a, b): 
    '''Return the length of the longest contiguous common subsequence
    (longest common substring) of sequences a and b, via dynamic
    programming: table[i][j] is the length of the common run ending at
    a[i-1] and b[j-1].

    Uses range rather than the Python-2-only xrange (identical behavior on
    Python 2, and the function then also runs under Python 3).
    '''
    table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)] 
    longest = 0
    for i, ca in enumerate(a, 1): 
        for j, cb in enumerate(b, 1): 
            if ca == cb: 
                table[i][j] = table[i - 1][j - 1] + 1 
                if table[i][j] > longest: 
                    longest = table[i][j] 
    return longest 

def calculate_syntactic_similarity(french_pos_tuples, english_pos_tuples): 
    '''Read two lists of (word, pos) tuples and return a 3-tuple:
    (cosine similarity of the POS-frequency vectors,
     longest common subsequence length / longer sentence length,
     longest contiguous common subsequence length / longer sentence length).

    BUG FIX: the subsequence functions were being called with the two
    Counters (whose iteration order over tag keys is meaningless) instead
    of the ordered POS lists; they now receive the lists themselves.
    '''
    french_pos_list = [tup[1] for tup in french_pos_tuples] 
    english_pos_list = [tup[1] for tup in english_pos_tuples] 
    # guard: two empty sentences have no measurable similarity
    denom = max(len(french_pos_list), len(english_pos_list))
    if denom == 0:
        return 0.0, 0.0, 0.0
    french_pos_counter = Counter(french_pos_list) 
    english_pos_counter = Counter(english_pos_list) 
    cosine_similarity = counter_cosine_similarity(french_pos_counter, english_pos_counter) 
    # normalize both sequence scores by the longer sentence so they lie in [0, 1]
    lc_subsequence = longest_common_subsequence_length(french_pos_list, english_pos_list)/denom 
    lc_contiguous_subsequence = longest_contiguous_subsequence_length(french_pos_list, english_pos_list)/denom 
    return cosine_similarity, lc_subsequence, lc_contiguous_subsequence 

########################### 
# Parse POS with Stanford # 
########################### 

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home 
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin" 

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands." 
french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés." 

#specify paths 
path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger" 
path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger" 
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar" 

#define english and french taggers 
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8") 
french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8") 

#each tuple in list_of_english_pos_tuples = (word, pos) 
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english)) 
list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french)) 

#simplify each tagset 
simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples] 
simplified_pos_tags_french = map_french_tag_to_universal(list_of_french_pos_tuples) 

print calculate_syntactic_similarity(simplified_pos_tags_french, simplified_pos_tags_english) 
+1

감사합니다! NLTK 사람들은 Stanford 태그 세트 ("Crabbe and Candito")에서 보편적 인 태그 세트로의 매핑에 관심이있을 수 있습니다. –

+0

내 기쁨! 필자는 향후 릴리스에서이 맵핑을 포함 할 수 있도록 어떤 시점에서 풀 요청을 시도하고 작성합니다. – duhaime

+1

@duhaime, 감사드립니다. 귀하의 매핑을 촬영하고 Universal POS 태그 프로젝트 (https://github.com/slavpetrov/universal-pos-tags/pull/12)에 기여할 수있는 요청을 작성하여 너와이 SO 페이지. –

관련 문제