2016-10-25 2 views
0

for 루프를 사용해 R 코드를 작성했는데, 더 나은 R 'apply' 계열 방법을 아시는 분이 있는지 궁금합니다. 아래에 제 '루프' 해결책을 제시하겠습니다. 루프 대신 apply를 사용하여 여러 디렉터리에 있는 파일의 평균값을 구하는 방법

목표 - 이번 달, 지난달, 그 전달의 이름이 붙은 디렉터리마다 같은 개수의 (같은 이름의) 테이블이 저장되어 있습니다. 이 월별 디렉터리들은 'm1'이라는 디렉터리 아래에 있습니다. 이 테이블들로 3개월 평균을 계산한 뒤 'm3' 디렉터리에 csv 파일로 다시 써야 합니다.

테이블은 'key' 열을 통해 월별로 서로 연결됩니다. 월별 레코드 집합은 많은 키를 공유하지만 완전히 동일하지는 않습니다. 그래서 rbind가 아니라 'merge'를 사용합니다.

셋업 및 일부 데이터 ...

# Reproducible setup: creates the 'm1/<month>' input directories and the 'm3'
# output directory, writes three identically named dummy CSV tables per month,
# and leaves `my_files_paths` (one character vector of monthly paths per table)
# in the workspace for the averaging step.
set.seed(1234)

# objects
my_list <- c("tbl_1", "tbl_2", "tbl_3")
month_list <- c("201604", "201605", "201606")

# dirs ('m1' and 'm3' first, so the monthly sub-directories have a parent)
for (dir_path in c("m1", "m3", paste0("m1/", month_list))) {
  if (!dir.exists(dir_path)) {
    dir.create(dir_path)
  }
}

# one list element per table, each holding that table's path in every month
my_files_paths <- lapply(
  my_list,
  function(x) paste0("m1/", month_list, "/", x, ".csv")
)

# pool of 100 random five-letter keys shared by all tables
keys <- replicate(100, paste0(sample(letters, 5), collapse = ""))

# column-name suffix used by each table
col_suffix <- c(tbl_1 = "abc", tbl_2 = "def", tbl_3 = "ghi")

# some dummy data
# Months form the outer loop and tables the inner loop, and the three sample()
# calls stay in key/count/amount order, so the random draws happen in the same
# sequence as writing each table out by hand would produce.
for (month in month_list) {
  for (tbl_name in my_list) {
    dummy <- data.frame(
      key = sample(keys, 90, replace = FALSE),
      count = sample(1:10, 90, replace = TRUE),
      amount = sample(1:30, 90, replace = TRUE)
    )
    names(dummy) <- c(
      "key",
      paste0(c("count_", "amount_"), col_suffix[[tbl_name]])
    )
    write.table(
      dummy,
      paste0("m1/", month, "/", tbl_name, ".csv"),
      col.names = TRUE, row.names = FALSE, sep = ","
    )
  }
}

# I am trying to merge the 'same named csvs' from dirs '201604', '201605' and '201606'
# and get the averages for the "identical' columns in each month's dataframes

# keep only `my_files_paths`; everything else was scaffolding
rm(month_list, my_list, keys, col_suffix, dir_path, month, tbl_name, dummy)

답변

0
# 'apply family' version of the 3-month average.
# For every table in `my_files_paths` the monthly CSVs are read, the values of
# each column are aligned on 'key' across months, averaged (ignoring months in
# which a key is absent) and written to 'm3/<table>.csv'.

# NB - base R only; stringr is not needed

# Average one table across its monthly files and write the result.
#
# paths   : character vector with the CSV path of the same table in each month
# out_dir : directory that receives the averaged CSV (must already exist)
#
# Returns the output path invisibly. Stops if `paths` is empty, if a file has
# no 'key' column, or if a key is repeated within a single month.
write_monthly_means <- function(paths, out_dir = "m3") {
  stopifnot(length(paths) > 0)

  monthly <- lapply(
    paths,
    read.table,
    header = TRUE, stringsAsFactors = FALSE, sep = ","
  )

  for (i in seq_along(monthly)) {
    month_keys <- monthly[[i]][["key"]]
    if (is.null(month_keys)) {
      stop("no 'key' column in ", paths[i], call. = FALSE)
    }
    if (anyDuplicated(month_keys) > 0) {
      stop("duplicate keys in ", paths[i], call. = FALSE)
    }
    monthly[[i]][["key"]] <- as.character(month_keys)
  }

  # union of keys over all months, sorted like merge(..., all = TRUE) output
  all_keys <- sort(unique(unlist(lapply(monthly, `[[`, "key"))))
  # value columns in order of first appearance
  value_cols <- setdiff(unique(unlist(lapply(monthly, names))), "key")

  means <- data.frame(key = all_keys, stringsAsFactors = FALSE)
  for (col in value_cols) {
    # keys x months matrix; NA where a key (or the column) is missing that month.
    # Aligning with match() instead of merge() avoids the '.x'/'.y' suffixes,
    # so column names are never rewritten and any number of months works.
    by_month <- do.call(cbind, lapply(monthly, function(tbl) {
      if (col %in% names(tbl)) {
        tbl[[col]][match(all_keys, tbl[["key"]])]
      } else {
        rep(NA_real_, length(all_keys))
      }
    }))
    means[[col]] <- round(rowMeans(by_month, na.rm = TRUE), 2)
  }

  # the output keeps the table's file name, e.g. 'm1/201604/tbl_1.csv' -> 'tbl_1.csv'
  out_path <- file.path(out_dir, basename(paths[1]))

  # write each new table with 3 month means to 'm3'
  write.table(means, out_path, sep = ",", col.names = TRUE, row.names = FALSE)
  invisible(out_path)
}

invisible(lapply(my_files_paths, write_monthly_means))

# cleanup
rm(write_monthly_means, my_files_paths)
관련 문제