2017-12-09 1 views
0

워드 클라우드를 만들기 전에 텍스트에서 영어 불용어를 제거하려고 했지만 작동하지 않았습니다. 여러 게시물을 읽고 제안된 방법들을 시도해 보았지만 해결하지 못했습니다. 어떤 도움이라도 주시면 감사하겠습니다. (제목: R 워드 클라우드 - 영어 불용어를 제거할 수 없습니다.)

library(tm) 
library(wordcloud) 
library(RColorBrewer) 
library(SnowballC) 

# Input text for the word cloud.
# NOTE(review): the text inside c(...) is not wrapped in quotes, so as written
# this line is not a valid character string.
textdata <- c(A secur breach expos privat inform of student loan borrow from Aug. 20-22 dure a comput softwar upgrade. User of the DOE Direct Loan Web site were abl to view inform other than their own if they use certain option when access the program web pages. SSNs were among the data element expos online. Softwar compani Affiliat Comput Servic (ACS) creat the technolog for the Direct Loan Servic featur on the DoE site.) 


# Create the corpus and clean the data.
# NOTE(review): every step except the stop-word removal is applied to `txt`
# (the raw corpus) rather than to `txtCorpus`, so each assignment overwrites
# the result of the previous step. The last assignment stems the raw corpus,
# which means the stop-word removal above it is discarded.
txt <- Corpus(VectorSource(textdata)) 
txtCorpus <- tm_map(txt, removePunctuation) 
txtCorpus <- tm_map(txt, removeNumbers) 
txtCorpus <- tm_map(txt, content_transformer(tolower)) 
txtCorpus <- tm_map(txtCorpus, removeWords, stopwords("english")) 
txtCorpus <- tm_map(txt, stripWhitespace); #inspect(docs[1]) 
txtCorpus <- tm_map(txt, stemDocument) 

# Create the term-document matrix, sum each term's counts across documents,
# and show the ten most frequent terms.
tdm <- TermDocumentMatrix(txtCorpus) 
m <- as.matrix(tdm) 
v <- sort(rowSums(m),decreasing=TRUE) 
d <- data.frame(word = names(v),freq=v, stringsAsFactors = FALSE) 
head(d, 10) 

출력

 word freq 

the  the  8469   
and  and  5790   
inform inform 2629   
was  was  2487   
secur secur 2249   
were were 1901   
social social 1890  

답변

0

코퍼스(corpus) 정제 코드를 다음과 같이 수정하세요:

library(tm) 
library(wordcloud) 
library(RColorBrewer) 
library(SnowballC) 
# Input text: a single character string, so the corpus holds one document.
textdata <- c("A secur breach expos privat inform of student loan borrow from Aug. 20-22 dure a comput softwar upgrade. User of the DOE Direct Loan Web site were abl to view inform other than their own if they use certain option when access the program web pages. SSNs were among the data element expos online. Softwar compani Affiliat Comput Servic (ACS) creat the technolog for the Direct Loan Servic featur on the DoE site. ")

# Build the corpus and clean it. Every step reassigns `corp` and takes the
# result of the previous step as input, so the transformations accumulate.
corp <- Corpus(VectorSource(textdata))
# Lower-case first: the stop-word list is lower case.
corp <- tm_map(corp, content_transformer(tolower))
# Remove stop words while punctuation is still present. Entries such as
# "don't" or "it's" contain apostrophes and would no longer match once
# punctuation has been stripped.
corp <- tm_map(corp, removeWords, stopwords("english"))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, stripWhitespace)
# Stem last so that stop-word matching sees the unstemmed tokens.
corp <- tm_map(corp, stemDocument)

# Term-document matrix -> total count per term -> frequency table sorted
# from most to least frequent.
tdm <- TermDocumentMatrix(corp)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v, stringsAsFactors = FALSE)
head(d, 10)
#   word freq 
# loan  loan 3 
# comput comput 2 
# direct direct 2 
# doe   doe 2 
# expo  expo 2 
# inform inform 2 
# servic servic 2 
# site  site 2 
# softwar softwar 2 
# web   web 2 
관련 문제