R 병렬로 사용하기

저는 멍청한 프로그래머입니다. 요소별로 나누는 데이터 프레임에 함수를 적용해야하는 코드를 작성했습니다. 데이터 프레임 자체에는 데이터 프레임을 조각 내기 위해 사용하는 변수에 64376 개의 요소가 포함 된 약 1 백만 건의 324961 관측이 포함됩니다. 다음과 같이R 병렬로 사용하기

코드는 다음과 같습니다 코드가 TL이다

library("readstata13") 
# Reading the Stata Data file into R 
bod_fb <- read.dta13("BoD_nonmissing_fb.dta") 

gen_fuzzy_blau <- function(bod_sample){ 

    # Here we drop the Variables that are not required in creating the Fuzzy-Blau index 

    bod_sample <- as.data.frame(bod_sample) 

    bod_sample$tot_occur <- as.numeric(bod_sample$tot_occur) 
    bod_sample$caste1_occ <- as.numeric(bod_sample$caste1_occ) 
    bod_sample$caste2_occ <- as.numeric(bod_sample$caste2_occ) 
    bod_sample$caste3_occ <- as.numeric(bod_sample$caste3_occ) 
    bod_sample$caste4_occ <- as.numeric(bod_sample$caste4_occ) 


    # Calculating the Probabilites of a director belonging to a caste 
    bod_sample$caste1_occ <- (bod_sample$caste1_occ)/(bod_sample$tot_occur) 
    bod_sample$caste2_occ <- (bod_sample$caste2_occ)/(bod_sample$tot_occur) 
    bod_sample$caste3_occ <- (bod_sample$caste3_occ)/(bod_sample$tot_occur) 
    bod_sample$caste4_occ <- (bod_sample$caste4_occ)/(bod_sample$tot_occur) 

    #Dropping the Total Occurances column, as we do not need it anymore 
    bod_sample$tot_occur<- NULL 

    # Here we replace all the blanks with NA 
    bod_sample <- apply(bod_sample, 2, function(x) gsub("^$|^ $", NA, x)) 
    bod_sample <- as.data.frame(bod_sample) 

    # Here we push all the NAs in the caste names and caste probabilities to the end of the row 
    # So if there are only two castes against a name, then they become caste1 and caste2 
    caste_list<-data.frame(bod_sample$caste1,bod_sample$caste2,bod_sample$caste3,bod_sample$caste4) 

    caste_list = as.data.frame(t(apply(caste_list,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]))}))) 
    caste_list_prob<-data.frame(bod_sample$caste1_occ,bod_sample$caste2_occ,bod_sample$caste3_occ,bod_sample$caste4_occ) 

    caste_list_prob = as.data.frame(t(apply(caste_list_prob,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]))}))) 

    # Here we write two functions: 1. gen_castelist 
    #        2. gen_caste_prob 
    # gen_castelist: This function takes the row number (serial number of the direcor) 
    #    and returns the names of all the castes for which he has a non-zero 
    #    probability. 
    # gen_caste_prob: This function takes the row number (serial number of the director) 
    #    and returns the probability with which he belongs to the caste 
    # 
    gen_castelist <- function(x){ 
    y <- caste_list[x,] 
    y <- as.vector(y[!is.na(y)]) 
    return(y) 
    } 

    gen_caste_prob <- function(x){ 
    z <- caste_list_prob[x,] 
    z <- z[!is.na(z)] 
    z <- as.numeric(z) 
    return(z) 
    } 

    caste_ls <-list() 
    caste_prob_ls <- list() 
    for(i in 1:nrow(bod_sample)) 
    { 
    caste_ls[[i]]<- gen_castelist(i) 
    caste_prob_ls[[i]]<- gen_caste_prob(i) 
    } 

    gridcaste <- expand.grid(caste_ls) 
    gridcaste <- data.frame(lapply(gridcaste, as.character), stringsAsFactors=FALSE) 

    gridcasteprob <- expand.grid(caste_prob_ls) 

    # Generating the Joint Probability 
    gridcasteprob$JP <- apply(gridcasteprob,1,prod) 

    # Generating the Similarity Index 
    gen_sim_index <- function(x){ 
    x <- t(x) 
    a <- as.data.frame(table(x)) 
    sim_index <- sum(a$Freq^2)/(sum(a$Freq))^2 
    return(sim_index) 
    } 
    gridcaste$sim_index <- apply(gridcaste,1,gen_sim_index) 

    # Generating fuzzyblau 
    gridcaste$fb <- gridcaste$sim_index * gridcasteprob$JP 

    fuzzy_blau_index <- sum(gridcaste$fb) 
    remove_list <- c("gridcaste","") 
    return(fuzzy_blau_index) 

} 

fuzzy_blau_output <- by(bod_fb,bod_fb$code_year,gen_fuzzy_blau) 

# Saving the output as a dataframe with two columns 
# Column 1 is the fuzzy blau index 
# Column 2 is the code_year 
code_year <- names(fuzzy_blau_output) 
fuzzy_blau <- as.data.frame(as.vector(unlist(fuzzy_blau_output))) 
names(fuzzy_blau) <- c("fuzzy_blau_index") 
fuzzy_blau$code_year <- code_year 

bod_fb <- merge(bod_fb,fuzzy_blau,by = "code_year") 
save.dta13(bod_fb,"bod_fb_example.dta")

경우, DR은, 요약은 다음과 같다 :

나는 dataframe bod_fb 있습니다. bod_fb$code_year의 요소로 데이터 프레임을 분할하여이 데이터 프레임에 gen_fuzzy_blau 함수를 적용해야합니다.

이 함수는 매우 거대하므로 순차 처리가 하루 이상 걸리고 메모리 부족으로 종료됩니다. gen_fuzzy_blau 함수는 데이터 프레임의 각 code_year에 대한 숫자 변수 fuzzy_blau_index을 반환합니다. 각 슬라이스에 기능을 적용하려면 by을 사용합니다. 이 코드를 병렬로 구현하여 여러 인스턴스의 함수가 동시에 데이터 프레임의 여러 슬라이스에서 실행되도록하는 방법이 있는지 알고 싶었습니다. 패키지에 대한 by 구현을 찾지 못했고 foreach 및 doParallel 패키지를 사용하는 동안 반복자로 데이터 프레임을 전달하는 방법을 알지 못했습니다.

저는 4GB RAM 및 Windows 7 sp1 home 기본을 갖춘 AMD A8 랩탑을 가지고 있습니다. 페이지 파일 메모리로 20GB를 부여했습니다 (메모리 오류가 발생한 후였습니다).

는

EDIT 감사 1 : 나는 코드의 중복을 제거하고있다 @milkmotel 제거 루프에 대한,하지만 기능에 gen_sim_index에서 낭비되는 많은 시간, 나는를 사용하고 proc.time() 코드의 각 부분이 차지하는 시간을 측정하는 기능.

이 함수는 행에 대해 다음과 같이 가정됩니다.행 (벡터가 아님)이라고하면 : a a b c 유사도 색인은 (2/4)^2 + (1/4)^2 + (1/4)^2 합계 (각 행의 각 고유 요소가 발생하지 않음/행에있는 요소의 총 개수 없음)^2

행에 직접 apply 함수를 사용할 수 없기 때문에 행의 각 요소가 다른 요소를 가지므로 table()이 주파수를 올바르게 출력하지 않기 때문에 각 요소가 한 행에 있습니다.

gen_sim_index 함수를 효율적으로 코딩하는 방법은 무엇입니까?

출처

2016-07-05 Venkata Srirama Krishna Teja K

6 가지 변수로 6 번 데이터를 저장합니다. 그렇게하지 마십시오.

gsub()로 말도 안되는 양의 데이터에 문자 색인을 실행하기 때문에 하루가 걸립니다.

gen_fuzzy_blau 함수에서 모든 값을 독립적으로 실행하는 것이 아니라 하나의 함수로 마무리 할 가치가 없으므로 코드를 가져와야합니다. 그런 다음 한 번에 한 줄씩 실행하십시오. 실행하는 데 너무 오래 걸리면 방법을 재고하십시오. 코드가 매우 비효율적입니다.

출처

2016-07-05 14:13:19 milkmotel

R 병렬로 사용하기

답변

관련 문제