저는 음성 인식 응용 프로그램에 sphinx4와 결합 된 HTk를 사용하려고했습니다. 내 입력을 wav 파일로 제공하고 있는데 스핑크스를 사용하여 "Transcriber demo"와 "Lattice Demo"를 제공했지만 출력은 거의 용납 할 수 없었습니다. 그래서 저는 Sphinx4로 HTK를 소개하기로 결정했습니다. 그러나 결과 출력은 멀리 떨어져있는 것처럼 보입니다. 구성이 더 조정될 수 있다고 확신합니다. 내가하는 일보다. 나는 htk와 sphinx4를 사용하는 ny 튜토리얼이 있는지 알아 내기 위해 잘 조사했다. 이 놀라운 블로그 (http://nsh.nexiwave.com/2009/09/using-htk-models-in-sphinx4.html) 외에도 다른 것을 찾지 못했습니다.숫자를 인식하지 못하도록 htk 사용
<?xml version="1.0" encoding="UTF-8"?>
<!--
Sphinx-4 Configuration file
-->
<!-- ******************************************************** -->
<!-- an4 configuration file -->
<!-- ******************************************************** -->
<config>
<!-- ******************************************************** -->
<!-- frequently tuned properties -->
<!-- ******************************************************** -->
<property name="logLevel" value="WARNING"/>
<property name="absoluteBeamWidth" value="-1"/>
<property name="relativeBeamWidth" value="1E-80"/>
<property name="wordInsertionProbability" value="1E-36"/>
<property name="languageWeight" value="8"/>
<property name="frontend" value="epFrontEnd"/>
<property name="recognizer" value="recognizer"/>
<property name="showCreations" value="false"/>
<!-- ******************************************************** -->
<!-- word recognizer configuration -->
<!-- ******************************************************** -->
<component name="recognizer" type="edu.cmu.sphinx.recognizer.Recognizer">
<property name="decoder" value="decoder"/>
<propertylist name="monitors">
<item>accuracyTracker </item>
<item>speedTracker </item>
<item>memoryTracker </item>
</propertylist>
</component>
<!-- ******************************************************** -->
<!-- The Decoder configuration -->
<!-- ******************************************************** -->
<component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">
<property name="searchManager" value="searchManager"/>
</component>
<!-- <component name="searchManager"
type="edu.cmu.sphinx.decoder.search.SimpleBreadthFirstSearchManager">
<property name="logMath" value="logMath"/>
<property name="linguist" value="lexTreeLinguist"/>
<property name="pruner" value="trivialPruner"/>
<property name="scorer" value="threadedScorer"/>
<property name="activeListFactory" value="activeList"/>
</component>
<component name="activeList"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
-->
<component name="searchManager"
type="edu.cmu.sphinx.decoder.search.WordPruningBreadthFirstSearchManager">
<property name="logMath" value="logMath"/>
<property name="linguist" value="lexTreeLinguist"/>
<property name="pruner" value="trivialPruner"/>
<property name="scorer" value="threadedScorer"/>
<property name="activeListManager" value="activeListManager"/>
<property name="activeListFactory" value="activeList"/>
<property name="growSkipInterval" value="0"/>
<property name="checkStateOrder" value="false"/>
<property name="buildWordLattice" value="false"/>
<property name="acousticLookaheadFrames" value="1.7"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<component name="trivialPruner"
type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>
<component name="threadedScorer"
type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
<property name="frontend" value="${frontend}"/>
</component>
<component name="activeListManager"
type="edu.cmu.sphinx.decoder.search.SimpleActiveListManager">
<propertylist name="activeListFactories">
<item>standardActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>wordActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
<item>standardActiveListFactory</item>
</propertylist>
</component>
<component name="standardActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<component name="wordActiveListFactory"
type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
<property name="logMath" value="logMath"/>
<property name="absoluteBeamWidth" value="${absoluteWordBeamWidth}"/>
<property name="relativeBeamWidth" value="${relativeWordBeamWidth}"/>
</component>
<!-- ******************************************************** -->
<!-- The linguist configuration -->
<!-- ******************************************************** -->
<component name="flatLinguist"
type="edu.cmu.sphinx.linguist.flat.FlatLinguist">
<property name="logMath" value="logMath"/>
<property name="grammar" value="jsgfGrammar"/>
<property name="acousticModel" value="wsj"/>
<property name="wordInsertionProbability"
value="${wordInsertionProbability}"/>
<property name="languageWeight" value="${languageWeight}"/>
<property name="unitManager" value="unitManager"/>
</component>
<!-- ******************************************************** -->
<!-- The Grammar configuration -->
<!-- ******************************************************** -->
<component name="jsgfGrammar" type="edu.cmu.sphinx.jsgf.JSGFGrammar">
<property name="dictionary" value="dictionary"/>
<property name="grammarLocation"
value="resource:/edu/cmu/sphinx/demo/transcriber/"/>
<property name="grammarName" value="digits"/>
<property name="logMath" value="logMath"/>
</component>
<!-- ******************************************************** -->
<!-- The Dictionary configuration
<component name="dictionary"
type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
<property name="dictionaryPath"
value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/dict/cmudict.0.6d"/>
<property name="fillerPath"
value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/>
<property name="addSilEndingPronunciation" value="false"/>
<property name="wordReplacement" value="<sil>"/>
<property name="unitManager" value="unitManager"/>
</component> -->
<!-- ******************************************************** -->
<!-- ******************************************************** -->
<!-- The Dictionary configuration -->
<!-- ******************************************************** -->
<component name="dictionary"
type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
<property name="dictionaryPath"
value="file:C:\Raveesh\Softwares\apache-tomcat-6.0.32\apache-tomcat-6.0.32\bin\models\language\wsj\5100.dic"/>
<property name="fillerPath"
value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz/noisedict"/>
<property name="addSilEndingPronunciation" value="false"/>
<property name="wordReplacement" value="<sil>"/>
<property name="unitManager" value="unitManager"/>
</component>
<!-- ******************************************************** -->
<!-- The acoustic model configuration -->
<!-- ******************************************************** -->
<component name="wsj"
type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel">
<property name="loader" value="wsjLoader"/>
<property name="unitManager" value="unitManager"/>
</component>
<component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.HTKLoader">
<property name="logMath" value="logMath"/>
<property name="modelDefinition" value="hmmdefs"/>
<property name="unitManager" value="unitManager"/>
</component>
<!--
<component name="wsjLoader" type="edu.cmu.sphinx.linguist.acoustic.tiedstate.Sphinx3Loader">
<property name="logMath" value="logMath"/>
<property name="unitManager" value="unitManager"/>
<property name="location" value="resource:/WSJ_8gau_13dCep_16k_40mel_130Hz_6800Hz"/>
</component>
-->
<!-- ******************************************************** -->
<!-- The unit manager configuration -->
<!-- ******************************************************** -->
<component name="unitManager"
type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>
<!-- ******************************************************** -->
<!-- The live frontend configuration -->
<!-- ******************************************************** -->
<!--
<component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
<propertylist name="pipeline">
<item>audioFileDataSource </item>
<item>dataBlocker </item>
<item>speechClassifier </item>
<item>speechMarker </item>
<item>nonSpeechDataFilter </item>
<item>preemphasizer </item>
<item>windower </item>
<item>fft </item>
<item>melFilterBank </item>
<item>dct </item>
<item>liveCMN </item>
<item>featureExtraction </item>
</propertylist>
</component>
-->
<!-- the front end configuration using the HTK loader.. -->
<component name="epFrontEnd" type="edu.cmu.sphinx.frontend.FrontEnd">
<propertylist name="pipeline">
<item>streamHTKSource</item>
</propertylist>
</component>
<component name="streamHTKSource" type="edu.cmu.sphinx.frontend.util.StreamHTKCepstrum">
<property name="cepstrumLength" value="39"/>
</component>
<!-- ******************************************************** -->
<!-- The frontend pipelines -->
<!-- ******************************************************** -->
<component name="audioFileDataSource" type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>
<component name="dataBlocker" type="edu.cmu.sphinx.frontend.DataBlocker"/>
<component name="speechClassifier" type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier"/>
<component name="nonSpeechDataFilter"
type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>
<component name="speechMarker" type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker" />
<component name="preemphasizer"
type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>
<component name="windower"
type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower">
</component>
<component name="fft"
type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform">
</component>
<component name="melFilterBank"
type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank">
</component>
<component name="dct"
type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>
<component name="liveCMN"
type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>
<component name="featureExtraction"
type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>
<!-- Newly Added.. -->
<component name="streamDataSource"
type="edu.cmu.sphinx.frontend.util.StreamDataSource">
<property name="sampleRate" value="16000"/>
<property name="bigEndianData" value="false"/>
</component>
<!-- ******************************************************* -->
<!-- monitors -->
<!-- ******************************************************* -->
<component name="accuracyTracker"
type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showAlignedResults" value="true"/>
<property name="showRawResults" value="true"/>
</component>
<component name="memoryTracker"
type="edu.cmu.sphinx.instrumentation.MemoryTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="showSummary" value="false"/>
<property name="showDetails" value="false"/>
</component>
<component name="speedTracker"
type="edu.cmu.sphinx.instrumentation.SpeedTracker">
<property name="recognizer" value="${recognizer}"/>
<property name="frontend" value="${frontend}"/>
<property name="showSummary" value="true"/>
<property name="showDetails" value="false"/>
</component>
<!-- ******************************************************* -->
<!-- Miscellaneous components -->
<!-- ******************************************************* -->
<component name="logMath" type="edu.cmu.sphinx.util.LogMath">
<property name="logBase" value="1.0001"/>
<property name="useAddTable" value="true"/>
</component>
<!-- ******************************************************** -->
<!-- The linguist configuration -->
<!-- ******************************************************** -->
<component name="lexTreeLinguist"
type="edu.cmu.sphinx.linguist.lextree.LexTreeLinguist">
<property name="logMath" value="logMath"/>
<property name="acousticModel" value="wsj"/>
<property name="languageModel" value="trigramModel"/>
<property name="dictionary" value="dictionary"/>
<property name="addFillerWords" value="false"/>
<property name="fillerInsertionProbability" value="1E-10"/>
<property name="generateUnitStates" value="false"/>
<property name="wantUnigramSmear" value="true"/>
<property name="unigramSmearWeight" value="1"/>
<property name="wordInsertionProbability"
value="${wordInsertionProbability}"/>
<property name="silenceInsertionProbability"
value="${silenceInsertionProbability}"/>
<property name="languageWeight" value="${languageWeight}"/>
<property name="unitManager" value="unitManager"/>
</component>
<!-- ******************************************************** -->
<!-- The Language Model configuration -->
<!-- ******************************************************** -->
<component name="trigramModel"
type="edu.cmu.sphinx.linguist.language.ngram.SimpleNGramModel">
<property name="location"
value="file:C:\Raveesh\Softwares\apache-tomcat-6.0.32\apache-tomcat-6.0.32\bin\models\language\wsj\5100.lm"/>
<property name="logMath" value="logMath"/>
<property name="dictionary" value="dictionary"/>
<property name="maxDepth" value="3"/>
<property name="unigramWeight" value=".7"/>
</component>
</config>
어떤 도움이 정말 감사하겠습니다
예, 블로그에서 그것에 대해 읽었습니다. 현재 나는 sphinx4 성능 향상을 찾고 있습니다. 하지만 머리에 손톱이 맞지 않는 것 같습니다. HTK를 떨어 뜨리기로 결정했다고 가정 해 봅시다. Sphinc 설정 파일 만 개선 해보시겠습니까? 나는 최근에 그가 빔 폭을 가지고 놀기 시작했다. –
나는 http://cmusphinx.sourceforge.net/wiki/sphinx4에 언급 된 모든 성능 전략을 확인했다. : 큰 어휘 성능 최적화. 그러나 그들은 transcriber 데모에 대한 좋은 인식 결과를 산출 할 수 없다. 접근 방법에 대한 도움이 필요합니까? –
안녕하세요. 성능 향상은 여러 가지 가능한 테스트 방법을 사용하는 복잡한 작업입니다. 우선 우리의 현재 성과가 실제로 무엇인지 알아 내야합니다. 빔을 조정하는 것은 적절한 테스트 데이터베이스와 정확도 및 현재 성능 분석이없는 경우 무의미합니다. 테스트를 받으면 성능 조언을 얻기 위해 cmusphinx sourceforge 포럼에서 테스트를 공유 할 수 있습니다. FAQ 항목을 참조하십시오 : http://cmusphinx.sourceforge.net/wiki/faq#qwhy_my_accuracy_is_poor –