2012-09-15 2 views
0

디렉토리에 있는 여러 CSV 파일을 기반으로 데이터 구조를 채웁니다. 이 파일들은 일련의 카운티(각 행)에 대한 고용, 가구 수 등의 증가 추세를 담고 있습니다. dat 객체의 길이만큼 루프를 돌면서 각 테이블을 데이터 테이블로 채우고 차례대로 처리할 수도 있지만, 루프를 피하고 코드를 간단하게 유지하여 나중에 훨씬 쉽게 유지 보수할 수 있도록 lapply 계열 함수의 기능에 관심이 있습니다. — 다중 테이블 구조로 작업하는 가장 실용적인 방법

구조의 모든 테이블에 있는 열에 대해 연산을 수행하고 싶습니다(예를 들어 grep으로 모든 열 이름을 이름에 포함된 연도 부분만 남도록 바꾸거나, county_id.i8 변수를 적절한 수준(level)을 가진 factor로 바꾸는 작업일 수 있습니다). 두 번째, 세 번째 수준까지 내려가도록 구조를 올바르게 인덱싱하고, 그 열에 lapply 호출을 적용하는 데 어려움을 겪고 있습니다. 예를 들어, 저는

> head(dat[1][[1]][1]) 
    county_id.i8 
1   1 
2   7 
3   17 
4   21 
5   23 
6   24 

위와 같이 하면 하위 테이블 중 하나의 county_id.i8 열에 접근할 수 있지만, 이는 특정 인덱스를 직접 지정했을 때만 가능합니다. 다시 말해, 다음과 같은 코드가 동작하게 하려면 어떻게 해야 할까요? (여기서는 lapply를 사용하여 구조 내 여러 테이블에 반복해서 나타나는 열을 처리하는 일반적인 예로, factor 수준에 문자를 할당합니다.)

lapply(dat[][[]][1],factor, labels=letters[1:9]) # indices omitted here; looking for a general notation to access the first column in all tables in the structure

몇 단계 아래의 수준을 모든 테이블에 걸쳐 올바르게 인덱싱하는 방법, 그리고 궁극적으로 구조에 포함된 여러 테이블의 열에 대해 lapply 함수를 호출하는 방법에 대한 조언이 있을까요? (데이터는 아래에 있으며, 다소 깁니다.)

> dat<-lapply(fileList,read.csv,header=T,sep = "\t") 
> dput(dat) 
list(structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 
28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_employment_pda_2010.f8 = c(313325, 
102645, 0, 19302, 0, 0, 9340, 0, 483207, 0, 110998, 444452, 0, 
22426, 74123, 0, 0), county_employment_pda_2011.f8 = c(313216, 
102576, 0, 19281, 0, 0, 9338, 0, 483043, 0, 110974, 445445, 0, 
22417, 74017, 0, 0), county_employment_pda_2012.f8 = c(313238, 
102504, 0, 19252, 0, 0, 9333, 0, 482906, 0, 110947, 446132, 0, 
22416, 73971, 0, 0), county_employment_pda_2013.f8 = c(313137, 
102418, 0, 19262, 0, 0, 9352, 0, 482697, 0, 110867, 447037, 0, 
22373, 73946, 0, 0), county_employment_pda_2014.f8 = c(313114, 
102399, 0, 19255, 0, 0, 9357, 0, 482469, 0, 110784, 447622, 0, 
22359, 73874, 0, 0), county_employment_pda_2015.f8 = c(312961, 
102281, 0, 19249, 0, 0, 9357, 0, 482181, 0, 110733, 448479, 0, 
22358, 73877, 0, 0), county_employment_pda_2016.f8 = c(312943, 
102273, 0, 19239, 0, 0, 9329, 0, 481983, 0, 110719, 449118, 0, 
22367, 73848, 0, 0), county_employment_pda_2017.f8 = c(312839, 
102216, 0, 19231, 0, 0, 9329, 0, 481889, 0, 110653, 450126, 0, 
22360, 73789, 0, 0), county_employment_pda_2018.f8 = c(312760, 
102188, 0, 19219, 0, 0, 9331, 0, 481795, 0, 110620, 451350, 0, 
22341, 73748, 0, 0), county_employment_pda_2019.f8 = c(312723, 
102139, 0, 19214, 0, 0, 9316, 0, 481816, 0, 110484, 452171, 0, 
22311, 73725, 0, 0), county_employment_pda_2020.f8 = c(312531, 
102094, 0, 19208, 0, 0, 9316, 0, 481812, 0, 110444, 453251, 0, 
22294, 73681, 0, 0)), .Names = c("county_id.i8", "county_employment_pda_2010.f8", 
"county_employment_pda_2011.f8", "county_employment_pda_2012.f8", 
"county_employment_pda_2013.f8", "county_employment_pda_2014.f8", 
"county_employment_pda_2015.f8", "county_employment_pda_2016.f8", 
"county_employment_pda_2017.f8", "county_employment_pda_2018.f8", 
"county_employment_pda_2019.f8", "county_employment_pda_2020.f8" 
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_employment_tpp_2010.f8 = c(450548, 
    164024, 0, 76243, 0, 0, 9280, 0, 578345, 0, 201026, 709110, 
    0, 39157, 79052, 0, 0), county_employment_tpp_2011.f8 = c(450309, 
    163934, 0, 76189, 0, 0, 9280, 0, 578065, 0, 200971, 710321, 
    0, 39154, 78941, 0, 0), county_employment_tpp_2012.f8 = c(450223, 
    163828, 0, 76125, 0, 0, 9278, 0, 577849, 0, 200922, 710952, 
    0, 39137, 78894, 0, 0), county_employment_tpp_2013.f8 = c(450078, 
    163780, 0, 76119, 0, 0, 9297, 0, 577584, 0, 200821, 712099, 
    0, 39123, 78858, 0, 0), county_employment_tpp_2014.f8 = c(449954, 
    163635, 0, 76071, 0, 0, 9297, 0, 577275, 0, 200757, 713066, 
    0, 39093, 78754, 0, 0), county_employment_tpp_2015.f8 = c(449743, 
    163455, 0, 76039, 0, 0, 9298, 0, 576946, 0, 200697, 713844, 
    0, 39095, 78671, 0, 0), county_employment_tpp_2016.f8 = c(449702, 
    163416, 0, 76012, 0, 0, 9270, 0, 576679, 0, 200621, 714573, 
    0, 39095, 78608, 0, 0), county_employment_tpp_2017.f8 = c(449493, 
    163366, 0, 75941, 0, 0, 9264, 0, 576523, 0, 200540, 715484, 
    0, 39093, 78554, 0, 0), county_employment_tpp_2018.f8 = c(449325, 
    163290, 0, 75815, 0, 0, 9266, 0, 576412, 0, 200353, 716977, 
    0, 39052, 78510, 0, 0), county_employment_tpp_2019.f8 = c(449077, 
    163186, 0, 75750, 0, 0, 9251, 0, 576354, 0, 200169, 717829, 
    0, 39018, 78487, 0, 0), county_employment_tpp_2020.f8 = c(448740, 
    163053, 0, 75704, 0, 0, 9250, 0, 576269, 0, 200122, 718708, 
    0, 39008, 78352, 0, 0)), .Names = c("county_id.i8", "county_employment_tpp_2010.f8", 
"county_employment_tpp_2011.f8", "county_employment_tpp_2012.f8", 
"county_employment_tpp_2013.f8", "county_employment_tpp_2014.f8", 
"county_employment_tpp_2015.f8", "county_employment_tpp_2016.f8", 
"county_employment_tpp_2017.f8", "county_employment_tpp_2018.f8", 
"county_employment_tpp_2019.f8", "county_employment_tpp_2020.f8" 
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_empres_pda_2010.f8 = c(201923, 
    43963, 0, 8772, 0, 0, 815, 0, 186251, 0, 78408, 160782, 0, 
    7238, 38390, 0, 0), county_empres_pda_2011.f8 = c(201783, 
    43849, 0, 8732, 0, 0, 795, 0, 213456, 0, 82832, 167626, 0, 
    7284, 37663, 0, 0), county_empres_pda_2012.f8 = c(202059, 
    44012, 0, 8742, 0, 0, 795, 0, 225552, 0, 87327, 167766, 0, 
    7498, 37518, 0, 0), county_empres_pda_2013.f8 = c(201918, 
    43878, 0, 8715, 0, 0, 789, 0, 232941, 0, 93303, 170896, 0, 
    7502, 38012, 0, 0), county_empres_pda_2014.f8 = c(209007, 
    43640, 0, 8648, 0, 0, 787, 0, 235599, 0, 96654, 174762, 0, 
    7530, 37910, 0, 0), county_empres_pda_2015.f8 = c(212050, 
    43789, 0, 8572, 0, 0, 776, 0, 234853, 0, 100111, 179057, 
    0, 7551, 37825, 0, 0), county_empres_pda_2016.f8 = c(214927, 
    43883, 0, 8531, 0, 0, 764, 0, 239730, 0, 102522, 182816, 
    0, 7518, 37677, 0, 0), county_empres_pda_2017.f8 = c(218551, 
    44331, 0, 8474, 0, 0, 764, 0, 240854, 0, 105426, 186818, 
    0, 7531, 37545, 0, 0), county_empres_pda_2018.f8 = c(220972, 
    45006, 0, 8432, 0, 0, 789, 0, 241628, 0, 107229, 190735, 
    0, 7546, 37596, 0, 0), county_empres_pda_2019.f8 = c(223044, 
    45761, 0, 8379, 0, 0, 818, 0, 244283, 0, 108506, 194185, 
    0, 7521, 37502, 0, 0), county_empres_pda_2020.f8 = c(224509, 
    46506, 0, 8394, 0, 0, 821, 0, 247482, 0, 109911, 197017, 
    0, 7504, 37591, 0, 0)), .Names = c("county_id.i8", "county_empres_pda_2010.f8", 
"county_empres_pda_2011.f8", "county_empres_pda_2012.f8", "county_empres_pda_2013.f8", 
"county_empres_pda_2014.f8", "county_empres_pda_2015.f8", "county_empres_pda_2016.f8", 
"county_empres_pda_2017.f8", "county_empres_pda_2018.f8", "county_empres_pda_2019.f8", 
"county_empres_pda_2020.f8"), class = "data.frame", row.names = c(NA, 
-17L)), structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 
24L, 28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), 
    county_empres_tpp_2010.f8 = c(443752, 155338, 0, 65907, 0, 
    0, 2828, 0, 456214, 0, 206571, 617212, 0, 37273, 48617, 0, 
    0), county_empres_tpp_2011.f8 = c(445080, 154940, 0, 65404, 
    0, 0, 2763, 0, 487189, 0, 212888, 630774, 0, 37297, 47577, 
    0, 0), county_empres_tpp_2012.f8 = c(445075, 155693, 0, 65455, 
    0, 0, 2774, 0, 499931, 0, 219766, 628413, 0, 37542, 47263, 
    0, 0), county_empres_tpp_2013.f8 = c(444322, 155348, 0, 65132, 
    0, 0, 2741, 0, 507989, 0, 228214, 632356, 0, 37431, 47767, 
    0, 0), county_empres_tpp_2014.f8 = c(451863, 154962, 0, 64733, 
    0, 0, 2723, 0, 509617, 0, 234769, 637378, 0, 37330, 47588, 
    0, 0), county_empres_tpp_2015.f8 = c(454943, 155713, 0, 64439, 
    0, 0, 2703, 0, 506948, 0, 241141, 643996, 0, 37412, 47541, 
    0, 0), county_empres_tpp_2016.f8 = c(458014, 156027, 0, 64177, 
    0, 0, 2673, 0, 512542, 0, 245412, 649009, 0, 37347, 47498, 
    0, 0), county_empres_tpp_2017.f8 = c(462516, 156941, 0, 63885, 
    0, 0, 2662, 0, 512233, 0, 249690, 654784, 0, 37323, 47328, 
    0, 0), county_empres_tpp_2018.f8 = c(465893, 158289, 0, 63755, 
    0, 0, 2713, 0, 511619, 0, 252055, 660675, 0, 37380, 47365, 
    0, 0), county_empres_tpp_2019.f8 = c(468709, 159607, 0, 63518, 
    0, 0, 2776, 0, 513150, 0, 253851, 666054, 0, 37360, 47326, 
    0, 0), county_empres_tpp_2020.f8 = c(471780, 160499, 0, 63788, 
    0, 0, 2781, 0, 515528, 0, 255491, 669768, 0, 37373, 47399, 
    0, 0)), .Names = c("county_id.i8", "county_empres_tpp_2010.f8", 
"county_empres_tpp_2011.f8", "county_empres_tpp_2012.f8", "county_empres_tpp_2013.f8", 
"county_empres_tpp_2014.f8", "county_empres_tpp_2015.f8", "county_empres_tpp_2016.f8", 
"county_empres_tpp_2017.f8", "county_empres_tpp_2018.f8", "county_empres_tpp_2019.f8", 
"county_empres_tpp_2020.f8"), class = "data.frame", row.names = c(NA, 
-17L)), structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 
24L, 28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), 
    county_households_pda_2010.f8 = c(170187, 37225, 0, 7006, 
    0, 0, 651, 0, 156702, 0, 53789, 111880, 0, 6479, 29142, 0, 
    0), county_households_pda_2011.f8 = c(169149, 37004, 0, 6943, 
    0, 0, 638, 0, 174237, 0, 56758, 115577, 0, 6497, 28608, 0, 
    0), county_households_pda_2012.f8 = c(169278, 37095, 0, 6955, 
    0, 0, 637, 0, 183566, 0, 60653, 115752, 0, 6634, 28529, 0, 
    0), county_households_pda_2013.f8 = c(169023, 36924, 0, 6933, 
    0, 0, 633, 0, 189987, 0, 65709, 117615, 0, 6626, 29128, 0, 
    0), county_households_pda_2014.f8 = c(174100, 36699, 0, 6889, 
    0, 0, 629, 0, 192437, 0, 68543, 120561, 0, 6643, 29129, 0, 
    0), county_households_pda_2015.f8 = c(176860, 36749, 0, 6839, 
    0, 0, 624, 0, 191746, 0, 71450, 123785, 0, 6646, 29103, 0, 
    0), county_households_pda_2016.f8 = c(179434, 36761, 0, 6812, 
    0, 0, 618, 0, 196075, 0, 73490, 126676, 0, 6616, 28999, 0, 
    0), county_households_pda_2017.f8 = c(182536, 37124, 0, 6772, 
    0, 0, 619, 0, 197431, 0, 75885, 129800, 0, 6618, 28961, 0, 
    0), county_households_pda_2018.f8 = c(184556, 37722, 0, 6736, 
    0, 0, 638, 0, 198440, 0, 77483, 132850, 0, 6620, 29037, 0, 
    0), county_households_pda_2019.f8 = c(186021, 38369, 0, 6698, 
    0, 0, 663, 0, 201268, 0, 78655, 135419, 0, 6591, 29009, 0, 
    0), county_households_pda_2020.f8 = c(187210, 38907, 0, 6717, 
    0, 0, 665, 0, 204334, 0, 79840, 137468, 0, 6569, 29120, 0, 
    0)), .Names = c("county_id.i8", "county_households_pda_2010.f8", 
"county_households_pda_2011.f8", "county_households_pda_2012.f8", 
"county_households_pda_2013.f8", "county_households_pda_2014.f8", 
"county_households_pda_2015.f8", "county_households_pda_2016.f8", 
"county_households_pda_2017.f8", "county_households_pda_2018.f8", 
"county_households_pda_2019.f8", "county_households_pda_2020.f8" 
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_households_tpp_2010.f8 = c(355208, 
    123536, 0, 50467, 0, 0, 2424, 0, 345393, 0, 138895, 406129, 
    0, 31138, 37070, 0, 0), county_households_tpp_2011.f8 = c(354126, 
    122641, 0, 49973, 0, 0, 2362, 0, 365214, 0, 143132, 413115, 
    0, 30966, 36340, 0, 0), county_households_tpp_2012.f8 = c(354044, 
    123118, 0, 50018, 0, 0, 2363, 0, 375399, 0, 149277, 411874, 
    0, 31089, 36140, 0, 0), county_households_tpp_2013.f8 = c(353169, 
    122709, 0, 49794, 0, 0, 2345, 0, 382912, 0, 156550, 414253, 
    0, 30988, 36782, 0, 0), county_households_tpp_2014.f8 = c(358483, 
    122366, 0, 49502, 0, 0, 2326, 0, 385046, 0, 162182, 418427, 
    0, 30902, 36727, 0, 0), county_households_tpp_2015.f8 = c(361320, 
    122856, 0, 49265, 0, 0, 2309, 0, 383106, 0, 167712, 423715, 
    0, 30946, 36751, 0, 0), county_households_tpp_2016.f8 = c(364010, 
    123010, 0, 49051, 0, 0, 2286, 0, 388265, 0, 171458, 427874, 
    0, 30863, 36738, 0, 0), county_households_tpp_2017.f8 = c(367824, 
    123698, 0, 48857, 0, 0, 2281, 0, 388946, 0, 175097, 432592, 
    0, 30811, 36694, 0, 0), county_households_tpp_2018.f8 = c(370694, 
    124838, 0, 48781, 0, 0, 2329, 0, 389217, 0, 177329, 437403, 
    0, 30821, 36766, 0, 0), county_households_tpp_2019.f8 = c(372758, 
    125835, 0, 48614, 0, 0, 2382, 0, 391713, 0, 179043, 441576, 
    0, 30800, 36802, 0, 0), county_households_tpp_2020.f8 = c(375254, 
    126557, 0, 48859, 0, 0, 2386, 0, 394477, 0, 180661, 444561, 
    0, 30801, 36884, 0, 0)), .Names = c("county_id.i8", "county_households_tpp_2010.f8", 
"county_households_tpp_2011.f8", "county_households_tpp_2012.f8", 
"county_households_tpp_2013.f8", "county_households_tpp_2014.f8", 
"county_households_tpp_2015.f8", "county_households_tpp_2016.f8", 
"county_households_tpp_2017.f8", "county_households_tpp_2018.f8", 
"county_households_tpp_2019.f8", "county_households_tpp_2020.f8" 
), class = "data.frame", row.names = c(NA, -17L))) 

답변

1
# The list of data frames from the question is assumed to be in my_list.
library(plyr)
# melt() is not exported by plyr; it is provided by reshape2.
library(reshape2)
# llply() is plyr's counterpart to lapply(); see ?llply for the differences.

new_list <- llply(my_list, function(foo) {
  # foo holds the contents of one list item at a time, i.e. a single
  # data.frame, so columns can be referenced as foo$something rather
  # than through a numeric index.
  # Reshape from wide to long, keeping the first column as the id.
  nicer_data <- melt(foo, id.vars = 1)
  names(nicer_data) <- c("county_id", "year", "value")
  # Keep only the four-digit year from names such as
  # "county_empres_pda_2011.f8". Matching on the pattern instead of on
  # fixed character positions works for every table, whatever the length
  # of the prefix before the year.
  nicer_data$year <- sub("^.*_([0-9]{4})\\.[^.]*$", "\\1", nicer_data$year)
  nicer_data
})
+0

방법이 이렇게 많다는 것이 놀랍습니다. 연습을 하다 보면 우아함도 따라오겠지요. 그런데 동시에 더 많은 연산을 수행하고 싶다면(예: 열 이름을 연도로 바꾸고 나머지 부분은 별도의 변수로 유지) 같은 llply 호출 안에서 처리하시겠습니까, 아니면 별도의 후속 단계에서 처리하시겠습니까? – ako

+1

더 많은 작업을 하고 싶다면 `{` `}` 안에 넣으면 됩니다(제가 넣어 둔 내용을 대체하세요). 즉, 각 항목에 대해 필요한 작업을 모두 수행한 뒤 그 결과를 반환하면 새 리스트에 담깁니다. 먼저 리스트 항목 하나에 대해 코드를 작성하고 테스트한 다음, 이를 함수로 만들어 전체에 적용하는 편이 쉽습니다. 행운을 빕니다! – Maiasaura

+1

특히 중간 데이터를 저장해야 하는 경우에는 `new_list`를 또 다른 `llply` 호출에 넘겨 그 결과를 저장할 수 있습니다. 그렇지 않다면 여기에서 필요한 작업을 모두 수행하면 됩니다. – Maiasaura

관련 문제