library("nycflights13")
library("tidyverse")
探索性数据分析
使用可视化和数据转换的方法来系统地探索数据,统计学家称之为探索性数据分析(exploratory data analysis,EDA)。EDA 是一个可迭代的循环过程:
- 对数据提出问题
- 对数据进行可视化
- 使用上一个步骤的结果来精炼问题,并提出新问题。
准备工作
EDA 本质上是一个创造性的过程,提出高质量问题的关键在于提出大量的问题。先定义几个术语:
- 变量
- 值
- 观测
- 表格数据
# Bar chart of how many diamonds fall into each `cut` category.
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut))

# The same per-category counts, computed directly as a table.
diamonds %>%
  count(cut)
cut | n |
---|---|
<ord> | <int> |
Fair | 1610 |
Good | 4906 |
Very Good | 12082 |
Premium | 13791 |
Ideal | 21551 |
通过 `dplyr::count()` 可以手动计算出这些计数结果。
使用 `binwidth` 参数设定直方图中间隔的宽度,该参数以 x 轴变量的单位来度量。使用直方图时,应尝试不同的宽度,观察其对图形的影响。
# Keep only the flights whose month and day are both 1.
dplyr::filter(flights, month == 1, day == 1)
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> |
2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 555 | 600 | -5 | 913 | 854 | 19 | B6 | 507 | N516JB | EWR | FLL | 158 | 1065 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 709 | 723 | -14 | EV | 5708 | N829AS | LGA | IAD | 53 | 229 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 838 | 846 | -8 | B6 | 79 | N593JB | JFK | MCO | 140 | 944 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 753 | 745 | 8 | AA | 301 | N3ALAA | LGA | ORD | 138 | 733 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 849 | 851 | -2 | B6 | 49 | N793JB | JFK | PBI | 149 | 1028 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 853 | 856 | -3 | B6 | 71 | N657JB | JFK | TPA | 158 | 1005 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 924 | 917 | 7 | UA | 194 | N29129 | JFK | LAX | 345 | 2475 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 923 | 937 | -14 | UA | 1124 | N53441 | EWR | SFO | 361 | 2565 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 941 | 910 | 31 | AA | 707 | N3DUAA | LGA | DFW | 257 | 1389 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 559 | 0 | 702 | 706 | -4 | B6 | 1806 | N708JB | JFK | BOS | 44 | 187 | 5 | 59 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 854 | 902 | -8 | UA | 1187 | N76515 | EWR | LAS | 337 | 2227 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 851 | 858 | -7 | B6 | 371 | N595JB | LGA | FLL | 152 | 1076 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 837 | 825 | 12 | MQ | 4650 | N542MQ | LGA | ATL | 134 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 601 | 600 | 1 | 844 | 850 | -6 | B6 | 343 | N644JB | EWR | PBI | 147 | 1023 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 610 | -8 | 812 | 820 | -8 | DL | 1919 | N971DL | LGA | MSP | 170 | 1020 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 605 | -3 | 821 | 805 | 16 | MQ | 4401 | N730MQ | LGA | DTW | 105 | 502 | 6 | 5 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 858 | 910 | -12 | AA | 1895 | N633AA | EWR | MIA | 152 | 1085 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 837 | 845 | -8 | DL | 1743 | N3739P | JFK | ATL | 128 | 760 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 607 | 607 | 0 | 858 | 915 | -17 | UA | 1077 | N53442 | EWR | MIA | 157 | 1085 | 6 | 7 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 608 | 600 | 8 | 807 | 735 | 32 | MQ | 3768 | N9EAMQ | EWR | ORD | 139 | 719 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 611 | 600 | 11 | 945 | 931 | 14 | UA | 303 | N532UA | JFK | SFO | 366 | 2586 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 613 | 610 | 3 | 925 | 921 | 4 | B6 | 135 | N635JB | JFK | RSW | 175 | 1074 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 1039 | 1100 | -21 | B6 | 709 | N794JB | JFK | SJU | 182 | 1598 | 6 | 15 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 833 | 842 | -9 | DL | 575 | N326NB | EWR | ATL | 120 | 746 | 6 | 15 | 2013-01-01 06:00:00 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 1 | 1 | 2140 | 2135 | 5 | 210 | 224 | -14 | B6 | 701 | N284JB | JFK | SJU | 189 | 1598 | 21 | 35 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2157 | 2155 | 2 | 43 | 41 | 2 | B6 | 43 | N537JB | JFK | MCO | 140 | 944 | 21 | 55 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2158 | 2200 | -2 | 2254 | 2307 | -13 | EV | 4103 | N14998 | EWR | BWI | 36 | 169 | 22 | 0 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2205 | 1720 | 285 | 46 | 2040 | 246 | AA | 1999 | N5DNAA | EWR | MIA | 146 | 1085 | 17 | 20 | 2013-01-01 17:00:00 |
2013 | 1 | 1 | 2209 | 2145 | 24 | 58 | 37 | 21 | B6 | 35 | N608JB | JFK | PBI | 143 | 1028 | 21 | 45 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2209 | 2155 | 14 | 2400 | 2337 | 23 | B6 | 1109 | N216JB | JFK | RDU | 86 | 427 | 21 | 55 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2211 | 2145 | 26 | 2339 | 2311 | 28 | B6 | 104 | N228JB | JFK | BUF | 64 | 301 | 21 | 45 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2217 | 2229 | -12 | 249 | 315 | -26 | B6 | 713 | N547JB | JFK | SJU | 191 | 1598 | 22 | 29 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2217 | 2130 | 47 | 140 | 27 | 73 | B6 | 21 | N516JB | JFK | TPA | 163 | 1005 | 21 | 30 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2221 | 2000 | 141 | 2331 | 2124 | 127 | EV | 4462 | N13566 | EWR | BUF | 56 | 282 | 20 | 0 | 2013-01-01 20:00:00 |
2013 | 1 | 1 | 2224 | 2200 | 24 | 2324 | 2316 | 8 | EV | 4206 | N16561 | EWR | PWM | 47 | 284 | 22 | 0 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2229 | 2159 | 30 | 149 | 100 | 49 | B6 | 11 | N531JB | JFK | FLL | 153 | 1069 | 21 | 59 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2240 | 2245 | -5 | 2340 | 2356 | -16 | B6 | 608 | N279JB | JFK | PWM | 44 | 273 | 22 | 45 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2250 | 2255 | -5 | 2352 | 2359 | -7 | B6 | 1018 | N521JB | JFK | BOS | 37 | 187 | 22 | 55 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2302 | 2200 | 62 | 2342 | 2253 | 49 | EV | 4276 | N13903 | EWR | BDL | 24 | 116 | 22 | 0 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2306 | 2245 | 21 | 28 | 5 | 23 | B6 | 30 | N281JB | JFK | ROC | 59 | 264 | 22 | 45 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2307 | 2245 | 22 | 32 | 2357 | 35 | B6 | 128 | N178JB | JFK | BTV | 59 | 266 | 22 | 45 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2310 | 2255 | 15 | 24 | 15 | 9 | B6 | 112 | N646JB | JFK | BUF | 57 | 301 | 22 | 55 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2312 | 2000 | 192 | 21 | 2110 | 191 | EV | 4312 | N13958 | EWR | DCA | 44 | 199 | 20 | 0 | 2013-01-01 20:00:00 |
2013 | 1 | 1 | 2323 | 2200 | 83 | 22 | 2313 | 69 | EV | 4257 | N13538 | EWR | BTV | 44 | 266 | 22 | 0 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2326 | 2130 | 116 | 131 | 18 | 73 | B6 | 199 | N594JB | JFK | LAS | 290 | 2248 | 21 | 30 | 2013-01-01 21:00:00 |
2013 | 1 | 1 | 2327 | 2250 | 37 | 32 | 2359 | 33 | B6 | 22 | N639JB | JFK | SYR | 45 | 209 | 22 | 50 | 2013-01-01 22:00:00 |
2013 | 1 | 1 | 2343 | 1724 | 379 | 314 | 1938 | 456 | EV | 4321 | N21197 | EWR | MCI | 222 | 1092 | 17 | 24 | 2013-01-01 17:00:00 |
2013 | 1 | 1 | 2353 | 2359 | -6 | 425 | 445 | -20 | B6 | 739 | N591JB | JFK | PSE | 195 | 1617 | 23 | 59 | 2013-01-01 23:00:00 |
2013 | 1 | 1 | 2353 | 2359 | -6 | 418 | 442 | -24 | B6 | 707 | N794JB | JFK | SJU | 185 | 1598 | 23 | 59 | 2013-01-01 23:00:00 |
2013 | 1 | 1 | 2356 | 2359 | -3 | 425 | 437 | -12 | B6 | 727 | N588JB | JFK | BQN | 186 | 1576 | 23 | 59 | 2013-01-01 23:00:00 |
2013 | 1 | 1 | NA | 1630 | NA | NA | 1815 | NA | EV | 4308 | N18120 | EWR | RDU | NA | 416 | 16 | 30 | 2013-01-01 16:00:00 |
2013 | 1 | 1 | NA | 1935 | NA | NA | 2240 | NA | AA | 791 | N3EHAA | LGA | DFW | NA | 1389 | 19 | 35 | 2013-01-01 19:00:00 |
2013 | 1 | 1 | NA | 1500 | NA | NA | 1825 | NA | AA | 1925 | N3EVAA | LGA | MIA | NA | 1096 | 15 | 0 | 2013-01-01 15:00:00 |
2013 | 1 | 1 | NA | 600 | NA | NA | 901 | NA | B6 | 125 | N618JB | JFK | FLL | NA | 1069 | 6 | 0 | 2013-01-01 06:00:00 |
# Subset of diamonds whose carat is below 3.
smaller <- diamonds %>%
  dplyr::filter(carat < 3)
# Sort flights by year, then month, then day (all ascending).
flights %>%
  arrange(year, month, day)
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> |
2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 555 | 600 | -5 | 913 | 854 | 19 | B6 | 507 | N516JB | EWR | FLL | 158 | 1065 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 709 | 723 | -14 | EV | 5708 | N829AS | LGA | IAD | 53 | 229 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 838 | 846 | -8 | B6 | 79 | N593JB | JFK | MCO | 140 | 944 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 753 | 745 | 8 | AA | 301 | N3ALAA | LGA | ORD | 138 | 733 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 849 | 851 | -2 | B6 | 49 | N793JB | JFK | PBI | 149 | 1028 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 853 | 856 | -3 | B6 | 71 | N657JB | JFK | TPA | 158 | 1005 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 924 | 917 | 7 | UA | 194 | N29129 | JFK | LAX | 345 | 2475 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 923 | 937 | -14 | UA | 1124 | N53441 | EWR | SFO | 361 | 2565 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 941 | 910 | 31 | AA | 707 | N3DUAA | LGA | DFW | 257 | 1389 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 559 | 0 | 702 | 706 | -4 | B6 | 1806 | N708JB | JFK | BOS | 44 | 187 | 5 | 59 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 854 | 902 | -8 | UA | 1187 | N76515 | EWR | LAS | 337 | 2227 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 851 | 858 | -7 | B6 | 371 | N595JB | LGA | FLL | 152 | 1076 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 837 | 825 | 12 | MQ | 4650 | N542MQ | LGA | ATL | 134 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 601 | 600 | 1 | 844 | 850 | -6 | B6 | 343 | N644JB | EWR | PBI | 147 | 1023 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 610 | -8 | 812 | 820 | -8 | DL | 1919 | N971DL | LGA | MSP | 170 | 1020 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 605 | -3 | 821 | 805 | 16 | MQ | 4401 | N730MQ | LGA | DTW | 105 | 502 | 6 | 5 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 858 | 910 | -12 | AA | 1895 | N633AA | EWR | MIA | 152 | 1085 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 837 | 845 | -8 | DL | 1743 | N3739P | JFK | ATL | 128 | 760 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 607 | 607 | 0 | 858 | 915 | -17 | UA | 1077 | N53442 | EWR | MIA | 157 | 1085 | 6 | 7 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 608 | 600 | 8 | 807 | 735 | 32 | MQ | 3768 | N9EAMQ | EWR | ORD | 139 | 719 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 611 | 600 | 11 | 945 | 931 | 14 | UA | 303 | N532UA | JFK | SFO | 366 | 2586 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 613 | 610 | 3 | 925 | 921 | 4 | B6 | 135 | N635JB | JFK | RSW | 175 | 1074 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 1039 | 1100 | -21 | B6 | 709 | N794JB | JFK | SJU | 182 | 1598 | 6 | 15 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 833 | 842 | -9 | DL | 575 | N326NB | EWR | ATL | 120 | 746 | 6 | 15 | 2013-01-01 06:00:00 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 12 | 31 | 2155 | 2039 | 76 | 253 | 2355 | NA | B6 | 1205 | N627JB | JFK | PDX | NA | 2454 | 20 | 39 | 2013-12-31 20:00:00 |
2013 | 12 | 31 | 2155 | 2150 | 5 | 110 | 51 | 19 | B6 | 1901 | N729JB | JFK | FLL | 164 | 1069 | 21 | 50 | 2013-12-31 21:00:00 |
2013 | 12 | 31 | 2159 | 2155 | 4 | 55 | 46 | 9 | B6 | 2053 | N593JB | JFK | PBI | 155 | 1028 | 21 | 55 | 2013-12-31 21:00:00 |
2013 | 12 | 31 | 2206 | 2110 | 56 | 44 | 2339 | 65 | B6 | 775 | N184JB | JFK | MSY | 195 | 1182 | 21 | 10 | 2013-12-31 21:00:00 |
2013 | 12 | 31 | 2211 | 2159 | 12 | 100 | 45 | 15 | B6 | 1183 | N715JB | JFK | MCO | 148 | 944 | 21 | 59 | 2013-12-31 21:00:00 |
2013 | 12 | 31 | 2218 | 2219 | -1 | 315 | 304 | 11 | B6 | 1203 | N625JB | JFK | SJU | 202 | 1598 | 22 | 19 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2235 | 2245 | -10 | 2351 | 2355 | -4 | B6 | 234 | N355JB | JFK | BTV | 49 | 266 | 22 | 45 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2245 | 2250 | -5 | 2359 | 2356 | 3 | B6 | 1816 | N318JB | JFK | SYR | 51 | 209 | 22 | 50 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2310 | 2255 | 15 | 7 | 2356 | 11 | B6 | 718 | N279JB | JFK | BOS | 40 | 187 | 22 | 55 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2321 | 2250 | 31 | 46 | 8 | 38 | B6 | 2002 | N179JB | JFK | BUF | 66 | 301 | 22 | 50 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2328 | 2330 | -2 | 412 | 409 | 3 | B6 | 1389 | N651JB | EWR | SJU | 198 | 1608 | 23 | 30 | 2013-12-31 23:00:00 |
2013 | 12 | 31 | 2332 | 2245 | 47 | 58 | 3 | 55 | B6 | 486 | N334JB | JFK | ROC | 60 | 264 | 22 | 45 | 2013-12-31 22:00:00 |
2013 | 12 | 31 | 2355 | 2359 | -4 | 430 | 440 | -10 | B6 | 1503 | N509JB | JFK | SJU | 195 | 1598 | 23 | 59 | 2013-12-31 23:00:00 |
2013 | 12 | 31 | 2356 | 2359 | -3 | 436 | 445 | -9 | B6 | 745 | N665JB | JFK | PSE | 200 | 1617 | 23 | 59 | 2013-12-31 23:00:00 |
2013 | 12 | 31 | NA | 1520 | NA | NA | 1705 | NA | AA | 341 | N568AA | LGA | ORD | NA | 733 | 15 | 20 | 2013-12-31 15:00:00 |
2013 | 12 | 31 | NA | 2025 | NA | NA | 2205 | NA | AA | 371 | N482AA | LGA | ORD | NA | 733 | 20 | 25 | 2013-12-31 20:00:00 |
2013 | 12 | 31 | NA | 1932 | NA | NA | 2305 | NA | B6 | 161 | N516JB | JFK | SMF | NA | 2521 | 19 | 32 | 2013-12-31 19:00:00 |
2013 | 12 | 31 | NA | 1505 | NA | NA | 1725 | NA | EV | 4181 | N24103 | EWR | MCI | NA | 1092 | 15 | 5 | 2013-12-31 15:00:00 |
2013 | 12 | 31 | NA | 1000 | NA | NA | 1252 | NA | UA | 1124 | NA | EWR | EGE | NA | 1725 | 10 | 0 | 2013-12-31 10:00:00 |
2013 | 12 | 31 | NA | 840 | NA | NA | 1205 | NA | UA | 1151 | NA | EWR | SEA | NA | 2402 | 8 | 40 | 2013-12-31 08:00:00 |
2013 | 12 | 31 | NA | 754 | NA | NA | 1118 | NA | UA | 1455 | NA | EWR | LAX | NA | 2454 | 7 | 54 | 2013-12-31 07:00:00 |
2013 | 12 | 31 | NA | 2000 | NA | NA | 2146 | NA | UA | 1482 | NA | EWR | ORD | NA | 719 | 20 | 0 | 2013-12-31 20:00:00 |
2013 | 12 | 31 | NA | 1500 | NA | NA | 1817 | NA | UA | 1483 | NA | EWR | AUS | NA | 1504 | 15 | 0 | 2013-12-31 15:00:00 |
2013 | 12 | 31 | NA | 1430 | NA | NA | 1750 | NA | UA | 1493 | NA | EWR | LAX | NA | 2454 | 14 | 30 | 2013-12-31 14:00:00 |
2013 | 12 | 31 | NA | 855 | NA | NA | 1142 | NA | UA | 1506 | NA | EWR | JAC | NA | 1874 | 8 | 55 | 2013-12-31 08:00:00 |
2013 | 12 | 31 | NA | 705 | NA | NA | 931 | NA | UA | 1729 | NA | EWR | DEN | NA | 1605 | 7 | 5 | 2013-12-31 07:00:00 |
2013 | 12 | 31 | NA | 825 | NA | NA | 1029 | NA | US | 1831 | NA | JFK | CLT | NA | 541 | 8 | 25 | 2013-12-31 08:00:00 |
2013 | 12 | 31 | NA | 1615 | NA | NA | 1800 | NA | MQ | 3301 | N844MQ | LGA | RDU | NA | 431 | 16 | 15 | 2013-12-31 16:00:00 |
2013 | 12 | 31 | NA | 600 | NA | NA | 735 | NA | UA | 219 | NA | EWR | ORD | NA | 719 | 6 | 0 | 2013-12-31 06:00:00 |
2013 | 12 | 31 | NA | 830 | NA | NA | 1154 | NA | UA | 443 | NA | JFK | LAX | NA | 2475 | 8 | 30 | 2013-12-31 08:00:00 |
`arrange()` 的工作方式与 `filter()` 函数相似,但前者不是选择行,而是改变行的顺序。
它接受一个数据框和一组作为排序依据的列名作为参数。
使用 `desc()` 可以按列进行降序排序。
# Sort flights by arrival delay, largest delay first.
flights %>%
  arrange(desc(arr_delay))
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> |
2013 | 1 | 9 | 641 | 900 | 1301 | 1242 | 1530 | 1272 | HA | 51 | N384HA | JFK | HNL | 640 | 4983 | 9 | 0 | 2013-01-09 09:00:00 |
2013 | 6 | 15 | 1432 | 1935 | 1137 | 1607 | 2120 | 1127 | MQ | 3535 | N504MQ | JFK | CMH | 74 | 483 | 19 | 35 | 2013-06-15 19:00:00 |
2013 | 1 | 10 | 1121 | 1635 | 1126 | 1239 | 1810 | 1109 | MQ | 3695 | N517MQ | EWR | ORD | 111 | 719 | 16 | 35 | 2013-01-10 16:00:00 |
2013 | 9 | 20 | 1139 | 1845 | 1014 | 1457 | 2210 | 1007 | AA | 177 | N338AA | JFK | SFO | 354 | 2586 | 18 | 45 | 2013-09-20 18:00:00 |
2013 | 7 | 22 | 845 | 1600 | 1005 | 1044 | 1815 | 989 | MQ | 3075 | N665MQ | JFK | CVG | 96 | 589 | 16 | 0 | 2013-07-22 16:00:00 |
2013 | 4 | 10 | 1100 | 1900 | 960 | 1342 | 2211 | 931 | DL | 2391 | N959DL | JFK | TPA | 139 | 1005 | 19 | 0 | 2013-04-10 19:00:00 |
2013 | 3 | 17 | 2321 | 810 | 911 | 135 | 1020 | 915 | DL | 2119 | N927DA | LGA | MSP | 167 | 1020 | 8 | 10 | 2013-03-17 08:00:00 |
2013 | 7 | 22 | 2257 | 759 | 898 | 121 | 1026 | 895 | DL | 2047 | N6716C | LGA | ATL | 109 | 762 | 7 | 59 | 2013-07-22 07:00:00 |
2013 | 12 | 5 | 756 | 1700 | 896 | 1058 | 2020 | 878 | AA | 172 | N5DMAA | EWR | MIA | 149 | 1085 | 17 | 0 | 2013-12-05 17:00:00 |
2013 | 5 | 3 | 1133 | 2055 | 878 | 1250 | 2215 | 875 | MQ | 3744 | N523MQ | EWR | ORD | 112 | 719 | 20 | 55 | 2013-05-03 20:00:00 |
2013 | 12 | 14 | 830 | 1845 | 825 | 1210 | 2154 | 856 | DL | 2391 | N939DL | JFK | TPA | 173 | 1005 | 18 | 45 | 2013-12-14 18:00:00 |
2013 | 5 | 19 | 713 | 1700 | 853 | 1007 | 1955 | 852 | AA | 257 | N3HEAA | JFK | LAS | 323 | 2248 | 17 | 0 | 2013-05-19 17:00:00 |
2013 | 1 | 1 | 848 | 1835 | 853 | 1001 | 1950 | 851 | MQ | 3944 | N942MQ | JFK | BWI | 41 | 184 | 18 | 35 | 2013-01-01 18:00:00 |
2013 | 6 | 27 | 959 | 1900 | 899 | 1236 | 2226 | 850 | DL | 2007 | N3762Y | JFK | PDX | 313 | 2454 | 19 | 0 | 2013-06-27 19:00:00 |
2013 | 12 | 19 | 734 | 1725 | 849 | 1046 | 2039 | 847 | DL | 1223 | N375NC | EWR | SLC | 290 | 1969 | 17 | 25 | 2013-12-19 17:00:00 |
2013 | 12 | 17 | 705 | 1700 | 845 | 1026 | 2020 | 846 | AA | 172 | N5EMAA | EWR | MIA | 145 | 1085 | 17 | 0 | 2013-12-17 17:00:00 |
2013 | 2 | 10 | 2243 | 830 | 853 | 100 | 1106 | 834 | F9 | 835 | N203FR | LGA | DEN | 233 | 1620 | 8 | 30 | 2013-02-10 08:00:00 |
2013 | 4 | 19 | 912 | 1940 | 812 | 1228 | 2247 | 821 | DL | 1435 | N900DE | LGA | TPA | 174 | 1010 | 19 | 40 | 2013-04-19 19:00:00 |
2013 | 6 | 27 | 753 | 1830 | 803 | 937 | 2015 | 802 | AA | 2019 | N571AA | LGA | STL | 134 | 888 | 18 | 30 | 2013-06-27 18:00:00 |
2013 | 11 | 3 | 603 | 1645 | 798 | 829 | 1913 | 796 | DL | 2042 | N990AT | EWR | ATL | 109 | 746 | 16 | 45 | 2013-11-03 16:00:00 |
2013 | 3 | 18 | 1020 | 2100 | 800 | 1336 | 32 | 784 | DL | 2363 | N624AG | JFK | LAX | 335 | 2475 | 21 | 0 | 2013-03-18 21:00:00 |
2013 | 4 | 19 | 606 | 1725 | 761 | 923 | 2020 | 783 | AA | 1901 | N3DGAA | JFK | IAH | 222 | 1417 | 17 | 25 | 2013-04-19 17:00:00 |
2013 | 4 | 19 | 617 | 1700 | 797 | 858 | 1955 | 783 | AA | 257 | N3GJAA | JFK | LAS | 313 | 2248 | 17 | 0 | 2013-04-19 17:00:00 |
2013 | 6 | 27 | 732 | 1825 | 787 | 932 | 2032 | 780 | DL | 1715 | N335NB | LGA | MSY | 160 | 1183 | 18 | 25 | 2013-06-27 18:00:00 |
2013 | 2 | 24 | 1921 | 615 | 786 | 2135 | 842 | 773 | DL | 575 | N348NW | EWR | ATL | 111 | 746 | 6 | 15 | 2013-02-24 06:00:00 |
2013 | 6 | 27 | 615 | 1705 | 790 | 853 | 2004 | 769 | DL | 503 | N372DA | JFK | SAN | 312 | 2446 | 17 | 5 | 2013-06-27 17:00:00 |
2013 | 2 | 19 | 2324 | 1016 | 788 | 114 | 1227 | 767 | DL | 2319 | N324US | LGA | MSP | 136 | 1020 | 10 | 16 | 2013-02-19 10:00:00 |
2013 | 2 | 16 | 757 | 1930 | 747 | 1013 | 2149 | 744 | 9E | 3798 | N8940E | JFK | CLT | 85 | 541 | 19 | 30 | 2013-02-16 19:00:00 |
2013 | 4 | 19 | 758 | 1925 | 753 | 1049 | 2225 | 744 | DL | 1485 | N927DA | LGA | MCO | 149 | 950 | 19 | 25 | 2013-04-19 19:00:00 |
2013 | 10 | 14 | 2042 | 900 | 702 | 2255 | 1127 | 688 | DL | 502 | N943DL | EWR | ATL | 98 | 746 | 9 | 0 | 2013-10-14 09:00:00 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 9 | 25 | NA | 1655 | NA | NA | 1840 | NA | MQ | 3411 | N735MQ | LGA | RDU | NA | 431 | 16 | 55 | 2013-09-25 16:00:00 |
2013 | 9 | 25 | NA | 1559 | NA | NA | 1719 | NA | MQ | 3748 | N530MQ | EWR | ORD | NA | 719 | 15 | 59 | 2013-09-25 15:00:00 |
2013 | 9 | 26 | 1331 | 1329 | 2 | 1923 | 1813 | NA | UA | 15 | N67052 | EWR | HNL | NA | 4963 | 13 | 29 | 2013-09-26 13:00:00 |
2013 | 9 | 26 | NA | 915 | NA | NA | 1141 | NA | EV | 5109 | N748EV | LGA | CHS | NA | 641 | 9 | 15 | 2013-09-26 09:00:00 |
2013 | 9 | 26 | NA | 1400 | NA | NA | 1512 | NA | US | 2183 | NA | LGA | DCA | NA | 214 | 14 | 0 | 2013-09-26 14:00:00 |
2013 | 9 | 26 | NA | 1240 | NA | NA | 1525 | NA | WN | 4720 | N691WN | EWR | HOU | NA | 1411 | 12 | 40 | 2013-09-26 12:00:00 |
2013 | 9 | 27 | 1332 | 1329 | 3 | 1629 | 1509 | NA | AA | 331 | N565AA | LGA | ORD | NA | 733 | 13 | 29 | 2013-09-27 13:00:00 |
2013 | 9 | 27 | 2253 | 1945 | 188 | NA | 2146 | NA | EV | 5306 | N605QX | LGA | GSO | NA | 461 | 19 | 45 | 2013-09-27 19:00:00 |
2013 | 9 | 27 | NA | 600 | NA | NA | 730 | NA | AA | 301 | N584AA | LGA | ORD | NA | 733 | 6 | 0 | 2013-09-27 06:00:00 |
2013 | 9 | 27 | NA | 2100 | NA | NA | 2211 | NA | US | 2164 | NA | LGA | BOS | NA | 184 | 21 | 0 | 2013-09-27 21:00:00 |
2013 | 9 | 27 | NA | 1329 | NA | NA | 1444 | NA | MQ | 3760 | N505MQ | EWR | ORD | NA | 719 | 13 | 29 | 2013-09-27 13:00:00 |
2013 | 9 | 27 | NA | 1600 | NA | NA | 1739 | NA | UA | 269 | NA | LGA | ORD | NA | 733 | 16 | 0 | 2013-09-27 16:00:00 |
2013 | 9 | 28 | 555 | 600 | -5 | 953 | 753 | NA | EV | 5068 | N133EV | EWR | DTW | NA | 488 | 6 | 0 | 2013-09-28 06:00:00 |
2013 | 9 | 28 | 847 | 839 | 8 | 1130 | 959 | NA | EV | 4510 | N14542 | EWR | MKE | NA | 725 | 8 | 39 | 2013-09-28 08:00:00 |
2013 | 9 | 28 | 1010 | 1020 | -10 | 1344 | 1222 | NA | EV | 4412 | N12175 | EWR | DSM | NA | 1017 | 10 | 20 | 2013-09-28 10:00:00 |
2013 | 9 | 28 | 1214 | 1225 | -11 | 1801 | 1510 | NA | AA | 300 | N488AA | EWR | DFW | NA | 1372 | 12 | 25 | 2013-09-28 12:00:00 |
2013 | 9 | 28 | NA | 1803 | NA | NA | 1927 | NA | EV | 5563 | N724EV | LGA | BTV | NA | 258 | 18 | 3 | 2013-09-28 18:00:00 |
2013 | 9 | 28 | NA | 910 | NA | NA | 1220 | NA | AA | 1 | N320AA | JFK | LAX | NA | 2475 | 9 | 10 | 2013-09-28 09:00:00 |
2013 | 9 | 28 | NA | 1635 | NA | NA | 1827 | NA | US | 581 | NA | EWR | CLT | NA | 529 | 16 | 35 | 2013-09-28 16:00:00 |
2013 | 9 | 29 | 1734 | 1711 | 23 | 2159 | 2020 | NA | UA | 327 | N463UA | EWR | PDX | NA | 2434 | 17 | 11 | 2013-09-29 17:00:00 |
2013 | 9 | 29 | NA | 2054 | NA | NA | 2302 | NA | EV | 4536 | N13988 | EWR | CVG | NA | 569 | 20 | 54 | 2013-09-29 20:00:00 |
2013 | 9 | 29 | NA | 1830 | NA | NA | 2010 | NA | MQ | 3134 | N508MQ | EWR | ORD | NA | 719 | 18 | 30 | 2013-09-29 18:00:00 |
2013 | 9 | 29 | NA | 700 | NA | NA | 833 | NA | UA | 331 | NA | LGA | ORD | NA | 733 | 7 | 0 | 2013-09-29 07:00:00 |
2013 | 9 | 30 | 559 | 600 | -1 | NA | 715 | NA | WN | 464 | N411WN | EWR | MDW | NA | 711 | 6 | 0 | 2013-09-30 06:00:00 |
2013 | 9 | 30 | NA | 1842 | NA | NA | 2019 | NA | EV | 5274 | N740EV | LGA | BNA | NA | 764 | 18 | 42 | 2013-09-30 18:00:00 |
2013 | 9 | 30 | NA | 1455 | NA | NA | 1634 | NA | 9E | 3393 | NA | JFK | DCA | NA | 213 | 14 | 55 | 2013-09-30 14:00:00 |
2013 | 9 | 30 | NA | 2200 | NA | NA | 2312 | NA | 9E | 3525 | NA | LGA | SYR | NA | 198 | 22 | 0 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | NA | 1210 | NA | NA | 1330 | NA | MQ | 3461 | N535MQ | LGA | BNA | NA | 764 | 12 | 10 | 2013-09-30 12:00:00 |
2013 | 9 | 30 | NA | 1159 | NA | NA | 1344 | NA | MQ | 3572 | N511MQ | LGA | CLE | NA | 419 | 11 | 59 | 2013-09-30 11:00:00 |
2013 | 9 | 30 | NA | 840 | NA | NA | 1020 | NA | MQ | 3531 | N839MQ | LGA | RDU | NA | 431 | 8 | 40 | 2013-09-30 08:00:00 |
使用管道组合多种操作
# Group the flights by destination, then print the grouped data.
by_dest <- group_by(flights, dest)
by_dest
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> |
2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 555 | 600 | -5 | 913 | 854 | 19 | B6 | 507 | N516JB | EWR | FLL | 158 | 1065 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 709 | 723 | -14 | EV | 5708 | N829AS | LGA | IAD | 53 | 229 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 557 | 600 | -3 | 838 | 846 | -8 | B6 | 79 | N593JB | JFK | MCO | 140 | 944 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 753 | 745 | 8 | AA | 301 | N3ALAA | LGA | ORD | 138 | 733 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 849 | 851 | -2 | B6 | 49 | N793JB | JFK | PBI | 149 | 1028 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 853 | 856 | -3 | B6 | 71 | N657JB | JFK | TPA | 158 | 1005 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 924 | 917 | 7 | UA | 194 | N29129 | JFK | LAX | 345 | 2475 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 558 | 600 | -2 | 923 | 937 | -14 | UA | 1124 | N53441 | EWR | SFO | 361 | 2565 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 941 | 910 | 31 | AA | 707 | N3DUAA | LGA | DFW | 257 | 1389 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 559 | 559 | 0 | 702 | 706 | -4 | B6 | 1806 | N708JB | JFK | BOS | 44 | 187 | 5 | 59 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 559 | 600 | -1 | 854 | 902 | -8 | UA | 1187 | N76515 | EWR | LAS | 337 | 2227 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 851 | 858 | -7 | B6 | 371 | N595JB | LGA | FLL | 152 | 1076 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 600 | 600 | 0 | 837 | 825 | 12 | MQ | 4650 | N542MQ | LGA | ATL | 134 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 601 | 600 | 1 | 844 | 850 | -6 | B6 | 343 | N644JB | EWR | PBI | 147 | 1023 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 610 | -8 | 812 | 820 | -8 | DL | 1919 | N971DL | LGA | MSP | 170 | 1020 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 602 | 605 | -3 | 821 | 805 | 16 | MQ | 4401 | N730MQ | LGA | DTW | 105 | 502 | 6 | 5 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 858 | 910 | -12 | AA | 1895 | N633AA | EWR | MIA | 152 | 1085 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 606 | 610 | -4 | 837 | 845 | -8 | DL | 1743 | N3739P | JFK | ATL | 128 | 760 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 607 | 607 | 0 | 858 | 915 | -17 | UA | 1077 | N53442 | EWR | MIA | 157 | 1085 | 6 | 7 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 608 | 600 | 8 | 807 | 735 | 32 | MQ | 3768 | N9EAMQ | EWR | ORD | 139 | 719 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 611 | 600 | 11 | 945 | 931 | 14 | UA | 303 | N532UA | JFK | SFO | 366 | 2586 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 613 | 610 | 3 | 925 | 921 | 4 | B6 | 135 | N635JB | JFK | RSW | 175 | 1074 | 6 | 10 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 1039 | 1100 | -21 | B6 | 709 | N794JB | JFK | SJU | 182 | 1598 | 6 | 15 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 615 | 615 | 0 | 833 | 842 | -9 | DL | 575 | N326NB | EWR | ATL | 120 | 746 | 6 | 15 | 2013-01-01 06:00:00 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 9 | 30 | 2123 | 2125 | -2 | 2223 | 2247 | -24 | EV | 5489 | N712EV | LGA | CHO | 45 | 305 | 21 | 25 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2127 | 2129 | -2 | 2314 | 2323 | -9 | EV | 3833 | N16546 | EWR | CLT | 72 | 529 | 21 | 29 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2128 | 2130 | -2 | 2328 | 2359 | -31 | B6 | 97 | N807JB | JFK | DEN | 213 | 1626 | 21 | 30 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2129 | 2059 | 30 | 2230 | 2232 | -2 | EV | 5048 | N751EV | LGA | RIC | 45 | 292 | 20 | 59 | 2013-09-30 20:00:00 |
2013 | 9 | 30 | 2131 | 2140 | -9 | 2225 | 2255 | -30 | MQ | 3621 | N807MQ | JFK | DCA | 36 | 213 | 21 | 40 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2140 | 2140 | 0 | 10 | 40 | -30 | AA | 185 | N335AA | JFK | LAX | 298 | 2475 | 21 | 40 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2142 | 2129 | 13 | 2250 | 2239 | 11 | EV | 4509 | N12957 | EWR | PWM | 47 | 284 | 21 | 29 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2145 | 2145 | 0 | 115 | 140 | -25 | B6 | 1103 | N633JB | JFK | SJU | 192 | 1598 | 21 | 45 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2147 | 2137 | 10 | 30 | 27 | 3 | B6 | 1371 | N627JB | LGA | FLL | 139 | 1076 | 21 | 37 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2149 | 2156 | -7 | 2245 | 2308 | -23 | UA | 523 | N813UA | EWR | BOS | 37 | 200 | 21 | 56 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2150 | 2159 | -9 | 2250 | 2306 | -16 | EV | 3842 | N10575 | EWR | MHT | 39 | 209 | 21 | 59 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2159 | 1845 | 194 | 2344 | 2030 | 194 | 9E | 3320 | N906XJ | JFK | BUF | 50 | 301 | 18 | 45 | 2013-09-30 18:00:00 |
2013 | 9 | 30 | 2203 | 2205 | -2 | 2339 | 2331 | 8 | EV | 5311 | N722EV | LGA | BGR | 61 | 378 | 22 | 5 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2207 | 2140 | 27 | 2257 | 2250 | 7 | MQ | 3660 | N532MQ | LGA | BNA | 97 | 764 | 21 | 40 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2211 | 2059 | 72 | 2339 | 2242 | 57 | EV | 4672 | N12145 | EWR | STL | 120 | 872 | 20 | 59 | 2013-09-30 20:00:00 |
2013 | 9 | 30 | 2231 | 2245 | -14 | 2335 | 2356 | -21 | B6 | 108 | N193JB | JFK | PWM | 48 | 273 | 22 | 45 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2233 | 2113 | 80 | 112 | 30 | 42 | UA | 471 | N578UA | EWR | SFO | 318 | 2565 | 21 | 13 | 2013-09-30 21:00:00 |
2013 | 9 | 30 | 2235 | 2001 | 154 | 59 | 2249 | 130 | B6 | 1083 | N804JB | JFK | MCO | 123 | 944 | 20 | 1 | 2013-09-30 20:00:00 |
2013 | 9 | 30 | 2237 | 2245 | -8 | 2345 | 2353 | -8 | B6 | 234 | N318JB | JFK | BTV | 43 | 266 | 22 | 45 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2240 | 2245 | -5 | 2334 | 2351 | -17 | B6 | 1816 | N354JB | JFK | SYR | 41 | 209 | 22 | 45 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2240 | 2250 | -10 | 2347 | 7 | -20 | B6 | 2002 | N281JB | JFK | BUF | 52 | 301 | 22 | 50 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2241 | 2246 | -5 | 2345 | 1 | -16 | B6 | 486 | N346JB | JFK | ROC | 47 | 264 | 22 | 46 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2307 | 2255 | 12 | 2359 | 2358 | 1 | B6 | 718 | N565JB | JFK | BOS | 33 | 187 | 22 | 55 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | 2349 | 2359 | -10 | 325 | 350 | -25 | B6 | 745 | N516JB | JFK | PSE | 196 | 1617 | 23 | 59 | 2013-09-30 23:00:00 |
2013 | 9 | 30 | NA | 1842 | NA | NA | 2019 | NA | EV | 5274 | N740EV | LGA | BNA | NA | 764 | 18 | 42 | 2013-09-30 18:00:00 |
2013 | 9 | 30 | NA | 1455 | NA | NA | 1634 | NA | 9E | 3393 | NA | JFK | DCA | NA | 213 | 14 | 55 | 2013-09-30 14:00:00 |
2013 | 9 | 30 | NA | 2200 | NA | NA | 2312 | NA | 9E | 3525 | NA | LGA | SYR | NA | 198 | 22 | 0 | 2013-09-30 22:00:00 |
2013 | 9 | 30 | NA | 1210 | NA | NA | 1330 | NA | MQ | 3461 | N535MQ | LGA | BNA | NA | 764 | 12 | 10 | 2013-09-30 12:00:00 |
2013 | 9 | 30 | NA | 1159 | NA | NA | 1344 | NA | MQ | 3572 | N511MQ | LGA | CLE | NA | 419 | 11 | 59 | 2013-09-30 11:00:00 |
2013 | 9 | 30 | NA | 840 | NA | NA | 1020 | NA | MQ | 3531 | N839MQ | LGA | RDU | NA | 431 | 8 | 40 | 2013-09-30 08:00:00 |
# Per-destination summary of by_dest: number of flights, mean distance and
# mean arrival delay (missing values are ignored in both means).
delay <- summarize(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay
dest | count | dist | delay |
---|---|---|---|
<chr> | <int> | <dbl> | <dbl> |
ABQ | 254 | 1826.0000 | 4.381890 |
ACK | 265 | 199.0000 | 4.852273 |
ALB | 439 | 143.0000 | 14.397129 |
ANC | 8 | 3370.0000 | -2.500000 |
ATL | 17215 | 757.1082 | 11.300113 |
AUS | 2439 | 1514.2530 | 6.019909 |
AVL | 275 | 583.5818 | 8.003831 |
BDL | 443 | 116.0000 | 7.048544 |
BGR | 375 | 378.0000 | 8.027933 |
BHM | 297 | 865.9966 | 16.877323 |
BNA | 6333 | 758.2135 | 11.812459 |
BOS | 15508 | 190.6370 | 2.914392 |
BQN | 896 | 1578.9833 | 8.245495 |
BTV | 2589 | 265.0915 | 8.950996 |
BUF | 4681 | 296.8084 | 8.945952 |
BUR | 371 | 2465.0000 | 8.175676 |
BWI | 1781 | 179.4183 | 10.726734 |
BZN | 36 | 1882.0000 | 7.600000 |
CAE | 116 | 603.5517 | 41.764151 |
CAK | 864 | 397.0000 | 19.698337 |
CHO | 52 | 305.0000 | 9.500000 |
CHS | 2884 | 632.9168 | 10.592968 |
CLE | 4573 | 414.1743 | 9.181611 |
CLT | 14064 | 538.0273 | 7.360319 |
CMH | 3524 | 476.5551 | 10.601323 |
CRW | 138 | 444.0000 | 14.671642 |
CVG | 3941 | 575.1599 | 15.364564 |
DAY | 1525 | 537.1023 | 12.680486 |
DCA | 9705 | 211.0062 | 9.066952 |
DEN | 7266 | 1614.6784 | 8.606500 |
⋮ | ⋮ | ⋮ | ⋮ |
PIT | 2875 | 334.0612 | 7.6809905 |
PSE | 365 | 1617.0000 | 7.8715084 |
PSP | 19 | 2378.0000 | -12.7222222 |
PVD | 376 | 160.0000 | 16.2346369 |
PWM | 2352 | 276.1284 | 11.6604021 |
RDU | 8163 | 426.7577 | 10.0523810 |
RIC | 2454 | 281.4046 | 20.1112532 |
ROC | 2416 | 259.2508 | 11.5606446 |
RSW | 3537 | 1072.8533 | 3.2381496 |
SAN | 2737 | 2437.2992 | 3.1391657 |
SAT | 686 | 1578.3411 | 6.9453718 |
SAV | 804 | 709.1841 | 15.1295060 |
SBN | 10 | 645.4000 | 6.5000000 |
SDF | 1157 | 645.9836 | 12.6693841 |
SEA | 3923 | 2412.6653 | -1.0990991 |
SFO | 13331 | 2577.9236 | 2.6728915 |
SJC | 329 | 2569.0000 | 3.4481707 |
SJU | 5819 | 1599.8336 | 2.5205266 |
SLC | 2467 | 1986.9866 | 0.1762546 |
SMF | 284 | 2521.0000 | 12.1099291 |
SNA | 825 | 2434.0000 | -7.8682266 |
SRQ | 1211 | 1044.6515 | 3.0824313 |
STL | 4339 | 878.7232 | 11.0784645 |
STT | 522 | 1626.9828 | -3.8359073 |
SYR | 1761 | 205.9216 | 8.9039250 |
TPA | 7466 | 1003.9356 | 7.4085250 |
TUL | 315 | 1215.0000 | 33.6598639 |
TVC | 101 | 652.3861 | 12.9684211 |
TYS | 631 | 638.8098 | 24.0692042 |
XNA | 1036 | 1142.5058 | 7.4657258 |
# Keep destinations with more than 20 flights and drop "HNL".
delay <- dplyr::filter(delay, count > 20, dest != "HNL")
delay
dest | count | dist | delay |
---|---|---|---|
<chr> | <int> | <dbl> | <dbl> |
ABQ | 254 | 1826.0000 | 4.3818898 |
ACK | 265 | 199.0000 | 4.8522727 |
ALB | 439 | 143.0000 | 14.3971292 |
ATL | 17215 | 757.1082 | 11.3001128 |
AUS | 2439 | 1514.2530 | 6.0199088 |
AVL | 275 | 583.5818 | 8.0038314 |
BDL | 443 | 116.0000 | 7.0485437 |
BGR | 375 | 378.0000 | 8.0279330 |
BHM | 297 | 865.9966 | 16.8773234 |
BNA | 6333 | 758.2135 | 11.8124589 |
BOS | 15508 | 190.6370 | 2.9143922 |
BQN | 896 | 1578.9833 | 8.2454955 |
BTV | 2589 | 265.0915 | 8.9509960 |
BUF | 4681 | 296.8084 | 8.9459519 |
BUR | 371 | 2465.0000 | 8.1756757 |
BWI | 1781 | 179.4183 | 10.7267338 |
BZN | 36 | 1882.0000 | 7.6000000 |
CAE | 116 | 603.5517 | 41.7641509 |
CAK | 864 | 397.0000 | 19.6983373 |
CHO | 52 | 305.0000 | 9.5000000 |
CHS | 2884 | 632.9168 | 10.5929685 |
CLE | 4573 | 414.1743 | 9.1816113 |
CLT | 14064 | 538.0273 | 7.3603189 |
CMH | 3524 | 476.5551 | 10.6013229 |
CRW | 138 | 444.0000 | 14.6716418 |
CVG | 3941 | 575.1599 | 15.3645638 |
DAY | 1525 | 537.1023 | 12.6804861 |
DCA | 9705 | 211.0062 | 9.0669520 |
DEN | 7266 | 1614.6784 | 8.6065002 |
DFW | 8738 | 1383.0430 | 0.3221268 |
⋮ | ⋮ | ⋮ | ⋮ |
PHL | 1632 | 94.32353 | 10.1271901 |
PHX | 4656 | 2141.30326 | 2.0970473 |
PIT | 2875 | 334.06122 | 7.6809905 |
PSE | 365 | 1617.00000 | 7.8715084 |
PVD | 376 | 160.00000 | 16.2346369 |
PWM | 2352 | 276.12840 | 11.6604021 |
RDU | 8163 | 426.75769 | 10.0523810 |
RIC | 2454 | 281.40465 | 20.1112532 |
ROC | 2416 | 259.25083 | 11.5606446 |
RSW | 3537 | 1072.85327 | 3.2381496 |
SAN | 2737 | 2437.29923 | 3.1391657 |
SAT | 686 | 1578.34111 | 6.9453718 |
SAV | 804 | 709.18408 | 15.1295060 |
SDF | 1157 | 645.98358 | 12.6693841 |
SEA | 3923 | 2412.66531 | -1.0990991 |
SFO | 13331 | 2577.92356 | 2.6728915 |
SJC | 329 | 2569.00000 | 3.4481707 |
SJU | 5819 | 1599.83365 | 2.5205266 |
SLC | 2467 | 1986.98662 | 0.1762546 |
SMF | 284 | 2521.00000 | 12.1099291 |
SNA | 825 | 2434.00000 | -7.8682266 |
SRQ | 1211 | 1044.65153 | 3.0824313 |
STL | 4339 | 878.72321 | 11.0784645 |
STT | 522 | 1626.98276 | -3.8359073 |
SYR | 1761 | 205.92164 | 8.9039250 |
TPA | 7466 | 1003.93557 | 7.4085250 |
TUL | 315 | 1215.00000 | 33.6598639 |
TVC | 101 | 652.38614 | 12.9684211 |
TYS | 631 | 638.80983 | 24.0692042 |
XNA | 1036 | 1142.50579 | 7.4657258 |
# Scatter plot of mean arrival delay against mean distance per destination;
# point size encodes the flight count, with a smoothed trend line on top.
delay %>%
  ggplot(aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
另一个将数据传入ggplot
的方法是使用管道。
# Same per-destination summary, written as a single pipeline:
# group by destination, summarise, then filter out small groups and "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarize(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  dplyr::filter(count > 20, dest != "HNL")
print(delays)
# A tibble: 96 × 4
dest count dist delay
<chr> <int> <dbl> <dbl>
1 ABQ 254 1826 4.38
2 ACK 265 199 4.85
3 ALB 439 143 14.4
4 ATL 17215 757. 11.3
5 AUS 2439 1514. 6.02
6 AVL 275 584. 8.00
7 BDL 443 116 7.05
8 BGR 375 378 8.03
9 BHM 297 866. 16.9
10 BNA 6333 758. 11.8
# … with 86 more rows
# Histogram of carat for the `smaller` data set, using 0.1-wide bins.
smaller %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.1)
所有的聚合函数遵循的一个一般原则是:若输入中存在缺失值,那么输出也会是缺失值。好在聚合函数都提供了na.rm
参数,能够在计算前去除缺失值。
计数
n()
或非缺失值的计数sum(!is.na(x))
,可以用来检查结论是否只是基于少量数据得出的
# Keep only flights whose departure and arrival delays are both recorded.
not_cancelled <- flights %>%
  dplyr::filter(!is.na(dep_delay), !is.na(arr_delay))
筛选出出发延误和到达延误都不含缺失值的航班数据
# Mean arrival delay per aircraft (tail number).
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(
    delay = mean(arr_delay)
  )
# Frequency polygon of the per-aircraft mean delays, using 10-wide bins.
ggplot(data = delays, mapping = aes(x = delay)) +
  geom_freqpoly(binwidth = 10)
# Mean arrival delay per aircraft together with the number of flights (n)
# each mean is based on.
delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarize(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
delays
tailnum | delay | n |
---|---|---|
<chr> | <dbl> | <int> |
D942DN | 31.5000000 | 4 |
N0EGMQ | 9.9829545 | 352 |
N10156 | 12.7172414 | 145 |
N102UW | 2.9375000 | 48 |
N103US | -6.9347826 | 46 |
N104UW | 1.8043478 | 46 |
N10575 | 20.6914498 | 269 |
N105UW | -0.2666667 | 45 |
N107US | -5.7317073 | 41 |
N108UW | -1.2500000 | 60 |
N109UW | -2.5208333 | 48 |
N110UW | 2.8000000 | 40 |
N11106 | 14.8809524 | 126 |
N11107 | 15.0143885 | 139 |
N11109 | 14.8510638 | 141 |
N11113 | 15.7619048 | 126 |
N11119 | 30.3065693 | 137 |
N11121 | 10.3061224 | 147 |
N11127 | 13.6050420 | 119 |
N11137 | 20.5514019 | 107 |
N11140 | 18.3120567 | 141 |
N11150 | 8.8914729 | 129 |
N11155 | 12.9895833 | 96 |
N11164 | 21.6496350 | 137 |
N11165 | 8.3466667 | 150 |
N11176 | 20.4045802 | 131 |
N11181 | 9.0750000 | 120 |
N11184 | 6.9843750 | 128 |
N11187 | 11.1904762 | 126 |
N11189 | 6.7372263 | 137 |
⋮ | ⋮ | ⋮ |
N985AT | 15.0384615 | 26 |
N985DL | 0.2539683 | 63 |
N986AT | 0.1250000 | 24 |
N986DL | 5.5833333 | 72 |
N987AT | 37.3846154 | 26 |
N987DL | -3.2909091 | 55 |
N988AT | 44.3428571 | 35 |
N988DL | 9.4716981 | 53 |
N989AT | 28.1935484 | 62 |
N989DL | 11.0340909 | 88 |
N990AT | 16.3857143 | 70 |
N990DL | 12.4285714 | 56 |
N991AT | 15.6800000 | 25 |
N991DL | 7.3369565 | 92 |
N992AT | 17.7837838 | 37 |
N992DL | -6.6491228 | 57 |
N993AT | 10.4042553 | 47 |
N993DL | 16.9811321 | 53 |
N994AT | 31.4838710 | 31 |
N994DL | 5.0327869 | 61 |
N995AT | 26.4705882 | 17 |
N995DL | 1.9298246 | 57 |
N996AT | 6.5384615 | 26 |
N996DL | 0.5247525 | 101 |
N997AT | 16.3023256 | 43 |
N997DL | 4.9032258 | 62 |
N998AT | 29.9600000 | 25 |
N998DL | 16.3947368 | 76 |
N999DN | 14.3114754 | 61 |
N9EAMQ | 9.2352941 | 238 |
# Scatter plot of mean delay against number of flights per aircraft;
# a low alpha makes overplotted regions visible.
delays %>%
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)
进一步还可以筛除观测数量较少的分组,能够得到更好的展示效果
# Same scatter plot, restricted to aircraft with more than 25 flights.
delays %>%
  dplyr::filter(n > 25) %>%
  ggplot(mapping = aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 3)
常见的摘要函数
位置度量:使用mean()
但median()
也较为常用。均值是总数除以个数;中位数则是:50%的x大于它,同时50%的x小于它。
# Daily summary: mean arrival delay over all flights (avg_delay1), number of
# flights (n), and mean arrival delay over flights with a positive delay only
# (avg_delay2).
not_cancelled %>%
  group_by(year, month, day) %>%
  summarize(
    avg_delay1 = mean(arr_delay, na.rm = TRUE),
    n = n(),
    avg_delay2 = mean(arr_delay[arr_delay > 0])
  )
`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.
year | month | day | avg_delay1 | n | avg_delay2 |
---|---|---|---|---|---|
<int> | <int> | <int> | <dbl> | <int> | <dbl> |
2013 | 1 | 1 | 12.6510229 | 831 | 32.48156 |
2013 | 1 | 2 | 12.6928879 | 928 | 32.02991 |
2013 | 1 | 3 | 5.7333333 | 900 | 27.66087 |
2013 | 1 | 4 | -1.9328194 | 908 | 28.30976 |
2013 | 1 | 5 | -1.5258020 | 717 | 22.55882 |
2013 | 1 | 6 | 4.2364294 | 829 | 24.37270 |
2013 | 1 | 7 | -4.9473118 | 930 | 27.76132 |
2013 | 1 | 8 | -3.2275785 | 892 | 20.78909 |
2013 | 1 | 9 | -0.2642777 | 893 | 25.63415 |
2013 | 1 | 10 | -5.8988159 | 929 | 27.34545 |
2013 | 1 | 11 | -4.7622683 | 917 | 26.15984 |
2013 | 1 | 12 | -13.0161527 | 681 | 23.47170 |
2013 | 1 | 13 | 14.9318463 | 807 | 52.54891 |
2013 | 1 | 14 | 3.6403034 | 923 | 22.44444 |
2013 | 1 | 15 | 0.4256527 | 881 | 19.71003 |
2013 | 1 | 16 | 34.2473623 | 853 | 46.08346 |
2013 | 1 | 17 | 6.4928962 | 915 | 25.60268 |
2013 | 1 | 18 | 1.8417582 | 910 | 25.05382 |
2013 | 1 | 19 | -8.5260805 | 671 | 24.28906 |
2013 | 1 | 20 | 3.7400768 | 781 | 29.02913 |
2013 | 1 | 21 | 6.3159645 | 902 | 32.24359 |
2013 | 1 | 22 | 12.2768362 | 885 | 33.88323 |
2013 | 1 | 23 | 6.9177928 | 888 | 33.05181 |
2013 | 1 | 24 | 15.4273128 | 908 | 42.08824 |
2013 | 1 | 25 | 27.0989761 | 879 | 55.73694 |
2013 | 1 | 26 | 0.7511177 | 671 | 29.05833 |
2013 | 1 | 27 | -1.3089330 | 806 | 36.02479 |
2013 | 1 | 28 | 9.3197200 | 857 | 38.70361 |
2013 | 1 | 29 | -6.5558113 | 869 | 30.13333 |
2013 | 1 | 30 | 25.9108040 | 796 | 57.42889 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 12 | 2 | 0.4509018 | 998 | 31.55488 |
2013 | 12 | 3 | 2.2227979 | 965 | 24.45926 |
2013 | 12 | 4 | -0.2222222 | 945 | 28.40643 |
2013 | 12 | 5 | 51.6662546 | 809 | 80.83571 |
2013 | 12 | 6 | 21.1194690 | 904 | 37.51089 |
2013 | 12 | 7 | 5.0570571 | 666 | 27.18707 |
2013 | 12 | 8 | 36.9118012 | 805 | 56.16000 |
2013 | 12 | 9 | 42.5755556 | 900 | 53.07383 |
2013 | 12 | 10 | 44.5087957 | 739 | 52.19440 |
2013 | 12 | 11 | 10.9024390 | 943 | 24.90559 |
2013 | 12 | 12 | 5.3399374 | 959 | 23.46320 |
2013 | 12 | 13 | 0.1630322 | 963 | 22.14121 |
2013 | 12 | 14 | 46.3975045 | 561 | 59.97802 |
2013 | 12 | 15 | 15.8700980 | 816 | 38.98678 |
2013 | 12 | 16 | 7.3706806 | 955 | 27.81385 |
2013 | 12 | 17 | 55.8718563 | 835 | 60.46795 |
2013 | 12 | 18 | 11.0798319 | 952 | 26.59503 |
2013 | 12 | 19 | 9.0062112 | 966 | 31.04008 |
2013 | 12 | 20 | 13.8201663 | 962 | 35.24905 |
2013 | 12 | 21 | 12.4569288 | 801 | 36.85539 |
2013 | 12 | 22 | 23.8995485 | 886 | 45.64875 |
2013 | 12 | 23 | 32.2260417 | 960 | 50.03397 |
2013 | 12 | 24 | -1.0438830 | 752 | 21.16961 |
2013 | 12 | 25 | -1.8993007 | 715 | 30.09500 |
2013 | 12 | 26 | 7.1741935 | 930 | 34.54869 |
2013 | 12 | 27 | -0.1488033 | 961 | 29.04683 |
2013 | 12 | 28 | -3.2595326 | 813 | 25.60769 |
2013 | 12 | 29 | 18.7638249 | 868 | 47.25636 |
2013 | 12 | 30 | 10.0577125 | 953 | 31.24380 |
2013 | 12 | 31 | 6.2121212 | 759 | 24.45596 |
# Show the first rows of the per-aircraft summary.
delays %>% head()
tailnum | delay | n |
---|---|---|
<chr> | <dbl> | <int> |
D942DN | 31.500000 | 4 |
N0EGMQ | 9.982955 | 352 |
N10156 | 12.717241 | 145 |
N102UW | 2.937500 | 48 |
N103US | -6.934783 | 46 |
N104UW | 1.804348 | 46 |
summarize()
返回的结果本身就是一个数据框,可以直接用于后续的绘图和分析
# Scatter plot of mean delay against number of flights per aircraft.
delays %>%
  ggplot(aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)
# Frequency polygons of carat, one coloured line per cut, using 0.1-wide bins.
smaller %>%
  ggplot(aes(x = carat, color = cut)) +
  geom_freqpoly(binwidth = 0.1)
典型值
# Histogram of carat with much narrower (0.01-wide) bins.
smaller %>%
  ggplot(aes(x = carat)) +
  geom_histogram(binwidth = 0.01)
# Eruption durations of the geyser in Yellowstone National Park (USA).
faithful %>% head()
eruptions | waiting | |
---|---|---|
<dbl> | <dbl> | |
1 | 3.600 | 79 |
2 | 1.800 | 54 |
3 | 3.333 | 74 |
4 | 2.283 | 62 |
5 | 4.533 | 85 |
6 | 2.883 | 55 |
# Histogram of eruption durations, using 0.25-wide bins.
faithful %>%
  ggplot(aes(x = eruptions)) +
  geom_histogram(binwidth = 0.25)
最终可以从图像中看出,喷发时间分为两组,一组以2分钟为主,另一组以4-5分钟为主
异常值
# Show the first rows of the diamonds data set.
diamonds %>% head()
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
0.24 | Very Good | J | VVS2 | 62.8 | 57 | 336 | 3.94 | 3.96 | 2.48 |
# The x axis shows the values of the variable y; geom_histogram() bins them
# and maps the resulting counts onto the plot.
diamonds %>%
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5)
常见值分箱中的观测数量太多,导致异常值所在的分箱高度太低,在图中几乎看不到。使用coord_cartesian()
将y轴靠近0部分放大
# Same histogram of y, with the visible count axis limited to 0-50 so that
# very short bars can be seen.
diamonds %>%
  ggplot(aes(x = y)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50))
# Rows whose y value is below 3 or above 20, sorted by y.
unusual <- diamonds %>%
  dplyr::filter(y < 3 | y > 20) %>%
  arrange(y)
unusual
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
1.00 | Very Good | H | VS2 | 63.3 | 53 | 5139 | 0.00 | 0.0 | 0.00 |
1.14 | Fair | G | VS1 | 57.5 | 67 | 6381 | 0.00 | 0.0 | 0.00 |
1.56 | Ideal | G | VS2 | 62.2 | 54 | 12800 | 0.00 | 0.0 | 0.00 |
1.20 | Premium | D | VVS1 | 62.1 | 59 | 15686 | 0.00 | 0.0 | 0.00 |
2.25 | Premium | H | SI2 | 62.8 | 59 | 18034 | 0.00 | 0.0 | 0.00 |
0.71 | Good | F | SI2 | 64.1 | 60 | 2130 | 0.00 | 0.0 | 0.00 |
0.71 | Good | F | SI2 | 64.1 | 60 | 2130 | 0.00 | 0.0 | 0.00 |
0.51 | Ideal | E | VS1 | 61.8 | 55 | 2075 | 5.15 | 31.8 | 5.12 |
2.00 | Premium | H | SI2 | 58.9 | 57 | 12210 | 8.09 | 58.9 | 8.06 |
缺失值
- 丢弃所有可疑的行
# Drop every row whose y value lies outside the range 3 to 20.
diamonds2 <- diamonds %>%
  dplyr::filter(between(y, 3, 20))
- 使用缺失值代替异常值,使用
mutate()
来创建一个新的变量来代替原有的变量
# Replace y values below 3 or above 20 with NA instead of dropping the rows;
# mutate() operates on the existing column y.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))
diamonds2
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
0.24 | Very Good | J | VVS2 | 62.8 | 57 | 336 | 3.94 | 3.96 | 2.48 |
0.24 | Very Good | I | VVS1 | 62.3 | 57 | 336 | 3.95 | 3.98 | 2.47 |
0.26 | Very Good | H | SI1 | 61.9 | 55 | 337 | 4.07 | 4.11 | 2.53 |
0.22 | Fair | E | VS2 | 65.1 | 61 | 337 | 3.87 | 3.78 | 2.49 |
0.23 | Very Good | H | VS1 | 59.4 | 61 | 338 | 4.00 | 4.05 | 2.39 |
0.30 | Good | J | SI1 | 64.0 | 55 | 339 | 4.25 | 4.28 | 2.73 |
0.23 | Ideal | J | VS1 | 62.8 | 56 | 340 | 3.93 | 3.90 | 2.46 |
0.22 | Premium | F | SI1 | 60.4 | 61 | 342 | 3.88 | 3.84 | 2.33 |
0.31 | Ideal | J | SI2 | 62.2 | 54 | 344 | 4.35 | 4.37 | 2.71 |
0.20 | Premium | E | SI2 | 60.2 | 62 | 345 | 3.79 | 3.75 | 2.27 |
0.32 | Premium | E | I1 | 60.9 | 58 | 345 | 4.38 | 4.42 | 2.68 |
0.30 | Ideal | I | SI2 | 62.0 | 54 | 348 | 4.31 | 4.34 | 2.68 |
0.30 | Good | J | SI1 | 63.4 | 54 | 351 | 4.23 | 4.29 | 2.70 |
0.30 | Good | J | SI1 | 63.8 | 56 | 351 | 4.23 | 4.26 | 2.71 |
0.30 | Very Good | J | SI1 | 62.7 | 59 | 351 | 4.21 | 4.27 | 2.66 |
0.30 | Good | I | SI2 | 63.3 | 56 | 351 | 4.26 | 4.30 | 2.71 |
0.23 | Very Good | E | VS2 | 63.8 | 55 | 352 | 3.85 | 3.92 | 2.48 |
0.23 | Very Good | H | VS1 | 61.0 | 57 | 353 | 3.94 | 3.96 | 2.41 |
0.31 | Very Good | J | SI1 | 59.4 | 62 | 353 | 4.39 | 4.43 | 2.62 |
0.31 | Very Good | J | SI1 | 58.1 | 62 | 353 | 4.44 | 4.47 | 2.59 |
0.23 | Very Good | G | VVS2 | 60.4 | 58 | 354 | 3.97 | 4.01 | 2.41 |
0.24 | Premium | I | VS1 | 62.5 | 57 | 355 | 3.97 | 3.94 | 2.47 |
0.30 | Very Good | J | VS2 | 62.2 | 57 | 357 | 4.28 | 4.30 | 2.67 |
0.23 | Very Good | D | VS2 | 60.5 | 61 | 357 | 3.96 | 3.97 | 2.40 |
0.23 | Very Good | F | VS1 | 60.9 | 57 | 357 | 3.96 | 3.99 | 2.42 |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
0.70 | Premium | E | SI1 | 60.5 | 58 | 2753 | 5.74 | 5.77 | 3.48 |
0.57 | Premium | E | IF | 59.8 | 60 | 2753 | 5.43 | 5.38 | 3.23 |
0.61 | Premium | F | VVS1 | 61.8 | 59 | 2753 | 5.48 | 5.40 | 3.36 |
0.80 | Good | G | VS2 | 64.2 | 58 | 2753 | 5.84 | 5.81 | 3.74 |
0.84 | Good | I | VS1 | 63.7 | 59 | 2753 | 5.94 | 5.90 | 3.77 |
0.77 | Ideal | E | SI2 | 62.1 | 56 | 2753 | 5.84 | 5.86 | 3.63 |
0.74 | Good | D | SI1 | 63.1 | 59 | 2753 | 5.71 | 5.74 | 3.61 |
0.90 | Very Good | J | SI1 | 63.2 | 60 | 2753 | 6.12 | 6.09 | 3.86 |
0.76 | Premium | I | VS1 | 59.3 | 62 | 2753 | 5.93 | 5.85 | 3.49 |
0.76 | Ideal | I | VVS1 | 62.2 | 55 | 2753 | 5.89 | 5.87 | 3.66 |
0.70 | Very Good | E | VS2 | 62.4 | 60 | 2755 | 5.57 | 5.61 | 3.49 |
0.70 | Very Good | E | VS2 | 62.8 | 60 | 2755 | 5.59 | 5.65 | 3.53 |
0.70 | Very Good | D | VS1 | 63.1 | 59 | 2755 | 5.67 | 5.58 | 3.55 |
0.73 | Ideal | I | VS2 | 61.3 | 56 | 2756 | 5.80 | 5.84 | 3.57 |
0.73 | Ideal | I | VS2 | 61.6 | 55 | 2756 | 5.82 | 5.84 | 3.59 |
0.79 | Ideal | I | SI1 | 61.6 | 56 | 2756 | 5.95 | 5.97 | 3.67 |
0.71 | Ideal | E | SI1 | 61.9 | 56 | 2756 | 5.71 | 5.73 | 3.54 |
0.79 | Good | F | SI1 | 58.1 | 59 | 2756 | 6.06 | 6.13 | 3.54 |
0.79 | Premium | E | SI2 | 61.4 | 58 | 2756 | 6.03 | 5.96 | 3.68 |
0.71 | Ideal | G | VS1 | 61.4 | 56 | 2756 | 5.76 | 5.73 | 3.53 |
0.71 | Premium | E | SI1 | 60.5 | 55 | 2756 | 5.79 | 5.74 | 3.49 |
0.71 | Premium | F | SI1 | 59.8 | 62 | 2756 | 5.74 | 5.73 | 3.43 |
0.70 | Very Good | E | VS2 | 60.5 | 59 | 2757 | 5.71 | 5.76 | 3.47 |
0.70 | Very Good | E | VS2 | 61.2 | 59 | 2757 | 5.69 | 5.72 | 3.49 |
0.72 | Premium | D | SI1 | 62.7 | 59 | 2757 | 5.69 | 5.73 | 3.58 |
0.72 | Ideal | D | SI1 | 60.8 | 57 | 2757 | 5.75 | 5.76 | 3.50 |
0.72 | Good | D | SI1 | 63.1 | 55 | 2757 | 5.69 | 5.75 | 3.61 |
0.70 | Very Good | D | SI1 | 62.8 | 60 | 2757 | 5.66 | 5.68 | 3.56 |
0.86 | Premium | H | SI2 | 61.0 | 58 | 2757 | 6.15 | 6.12 | 3.74 |
0.75 | Ideal | D | SI2 | 62.2 | 55 | 2757 | 5.83 | 5.87 | 3.64 |
# Scatter plot of y against x; na.rm = TRUE drops the missing values silently.
diamonds2 %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(na.rm = TRUE)
相关变动
相关变动就是描述多个变量之间的行为。以可视化的方式进行查看
# Frequency polygons of price, one coloured line per cut, using 500-wide bins.
diamonds %>%
  ggplot(aes(x = price)) +
  geom_freqpoly(aes(color = cut), binwidth = 500)
# Bar chart of the number of diamonds in each cut.
diamonds %>%
  ggplot(aes(x = cut)) +
  geom_bar()
# Box plot of price for each cut.
diamonds %>%
  ggplot(aes(x = cut, y = price)) +
  geom_boxplot()
# Box plot of hwy grouped by the categorical variable class.
mpg %>%
  ggplot(aes(x = class, y = hwy)) +
  geom_boxplot()
# Count each (color, cut) combination and draw the counts as a tile plot.
diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_tile(mapping = aes(fill = n)) # fill is still mapped inside aes()
两个分类变量
# Count each (color, cut) combination and encode the count as point colour.
diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
  geom_point(aes(color = n))
模式和模型
# Scatter plot of waiting against eruptions.
faithful %>%
  ggplot(aes(x = eruptions, y = waiting)) +
  geom_point()
数据处理
数据处理是一门艺术,将数据以合适的形式导入R,从而进行可视化和建模。
# Convert the iris data frame to a tibble.
iris %>% as_tibble()
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <fct> |
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3.0 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5.0 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
4.6 | 3.4 | 1.4 | 0.3 | setosa |
5.0 | 3.4 | 1.5 | 0.2 | setosa |
4.4 | 2.9 | 1.4 | 0.2 | setosa |
4.9 | 3.1 | 1.5 | 0.1 | setosa |
5.4 | 3.7 | 1.5 | 0.2 | setosa |
4.8 | 3.4 | 1.6 | 0.2 | setosa |
4.8 | 3.0 | 1.4 | 0.1 | setosa |
4.3 | 3.0 | 1.1 | 0.1 | setosa |
5.8 | 4.0 | 1.2 | 0.2 | setosa |
5.7 | 4.4 | 1.5 | 0.4 | setosa |
5.4 | 3.9 | 1.3 | 0.4 | setosa |
5.1 | 3.5 | 1.4 | 0.3 | setosa |
5.7 | 3.8 | 1.7 | 0.3 | setosa |
5.1 | 3.8 | 1.5 | 0.3 | setosa |
5.4 | 3.4 | 1.7 | 0.2 | setosa |
5.1 | 3.7 | 1.5 | 0.4 | setosa |
4.6 | 3.6 | 1.0 | 0.2 | setosa |
5.1 | 3.3 | 1.7 | 0.5 | setosa |
4.8 | 3.4 | 1.9 | 0.2 | setosa |
5.0 | 3.0 | 1.6 | 0.2 | setosa |
5.0 | 3.4 | 1.6 | 0.4 | setosa |
5.2 | 3.5 | 1.5 | 0.2 | setosa |
5.2 | 3.4 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.6 | 0.2 | setosa |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
6.9 | 3.2 | 5.7 | 2.3 | virginica |
5.6 | 2.8 | 4.9 | 2.0 | virginica |
7.7 | 2.8 | 6.7 | 2.0 | virginica |
6.3 | 2.7 | 4.9 | 1.8 | virginica |
6.7 | 3.3 | 5.7 | 2.1 | virginica |
7.2 | 3.2 | 6.0 | 1.8 | virginica |
6.2 | 2.8 | 4.8 | 1.8 | virginica |
6.1 | 3.0 | 4.9 | 1.8 | virginica |
6.4 | 2.8 | 5.6 | 2.1 | virginica |
7.2 | 3.0 | 5.8 | 1.6 | virginica |
7.4 | 2.8 | 6.1 | 1.9 | virginica |
7.9 | 3.8 | 6.4 | 2.0 | virginica |
6.4 | 2.8 | 5.6 | 2.2 | virginica |
6.3 | 2.8 | 5.1 | 1.5 | virginica |
6.1 | 2.6 | 5.6 | 1.4 | virginica |
7.7 | 3.0 | 6.1 | 2.3 | virginica |
6.3 | 3.4 | 5.6 | 2.4 | virginica |
6.4 | 3.1 | 5.5 | 1.8 | virginica |
6.0 | 3.0 | 4.8 | 1.8 | virginica |
6.9 | 3.1 | 5.4 | 2.1 | virginica |
6.7 | 3.1 | 5.6 | 2.4 | virginica |
6.9 | 3.1 | 5.1 | 2.3 | virginica |
5.8 | 2.7 | 5.1 | 1.9 | virginica |
6.8 | 3.2 | 5.9 | 2.3 | virginica |
6.7 | 3.3 | 5.7 | 2.5 | virginica |
6.7 | 3.0 | 5.2 | 2.3 | virginica |
6.3 | 2.5 | 5.0 | 1.9 | virginica |
6.5 | 3.0 | 5.2 | 2.0 | virginica |
6.2 | 3.4 | 5.4 | 2.3 | virginica |
5.9 | 3.0 | 5.1 | 1.8 | virginica |
自行生成一个tibble
# Build a tibble by hand: y is recycled to the length of x, and z is computed
# from the columns defined before it.
tibble(x = 1:5, y = 1, z = x^2 + y)
x | y | z |
---|---|---|
<int> | <dbl> | <dbl> |
1 | 1 | 2 |
2 | 1 | 5 |
3 | 1 | 10 |
4 | 1 | 17 |
5 | 1 | 26 |
对两个分类变量的相关变动进行可视化表示,需要计算出每个变量组合的观测数量。(同时上文的geom_point(aes(color=n))
也能实现类似的功能)
# geom_count() maps the number of observations at each (cut, color)
# combination to point size.
diamonds %>%
  ggplot(aes(x = cut, y = color)) +
  geom_count()
# Show the first rows of the diamonds data set.
diamonds %>% head()
carat | cut | color | clarity | depth | table | price | x | y | z |
---|---|---|---|---|---|---|---|---|---|
<dbl> | <ord> | <ord> | <ord> | <dbl> | <dbl> | <int> | <dbl> | <dbl> | <dbl> |
0.23 | Ideal | E | SI2 | 61.5 | 55 | 326 | 3.95 | 3.98 | 2.43 |
0.21 | Premium | E | SI1 | 59.8 | 61 | 326 | 3.89 | 3.84 | 2.31 |
0.23 | Good | E | VS1 | 56.9 | 65 | 327 | 4.05 | 4.07 | 2.31 |
0.29 | Premium | I | VS2 | 62.4 | 58 | 334 | 4.20 | 4.23 | 2.63 |
0.31 | Good | J | SI2 | 63.3 | 58 | 335 | 4.34 | 4.35 | 2.75 |
0.24 | Very Good | J | VVS2 | 62.8 | 57 | 336 | 3.94 | 3.96 | 2.48 |
# A 1000-row tibble with one column each of date-time, date, integer, double
# and character values. The columns are kept in their original order so the
# random draws happen in the same sequence.
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
a | b | c | d | e |
---|---|---|---|---|
<dttm> | <date> | <int> | <dbl> | <chr> |
2022-10-02 01:17:43 | 2022-10-23 | 1 | 0.07904540 | d |
2022-10-01 22:59:44 | 2022-10-16 | 2 | 0.17864565 | z |
2022-10-01 23:43:46 | 2022-10-08 | 3 | 0.86078870 | d |
2022-10-02 03:40:45 | 2022-10-04 | 4 | 0.93952259 | g |
2022-10-01 23:50:48 | 2022-10-12 | 5 | 0.87904425 | k |
2022-10-02 08:52:52 | 2022-10-22 | 6 | 0.01717623 | y |
2022-10-01 23:04:13 | 2022-10-20 | 7 | 0.61590930 | s |
2022-10-02 07:41:45 | 2022-10-02 | 8 | 0.15905475 | t |
2022-10-02 03:38:31 | 2022-10-06 | 9 | 0.28964310 | e |
2022-10-02 03:33:58 | 2022-10-05 | 10 | 0.73699888 | y |
2022-10-02 03:26:09 | 2022-10-05 | 11 | 0.34331203 | t |
2022-10-02 11:56:58 | 2022-10-24 | 12 | 0.56826309 | s |
2022-10-02 05:23:19 | 2022-10-14 | 13 | 0.10237698 | g |
2022-10-02 14:05:24 | 2022-10-05 | 14 | 0.03664026 | k |
2022-10-02 12:35:50 | 2022-10-30 | 15 | 0.49635435 | t |
2022-10-02 02:42:11 | 2022-10-25 | 16 | 0.01659913 | c |
2022-10-02 05:53:12 | 2022-10-05 | 17 | 0.63373399 | w |
2022-10-02 08:18:19 | 2022-10-27 | 18 | 0.01377930 | l |
2022-10-02 17:44:45 | 2022-10-17 | 19 | 0.65322412 | v |
2022-10-01 22:21:33 | 2022-10-04 | 20 | 0.76266778 | k |
2022-10-02 11:31:59 | 2022-10-28 | 21 | 0.22273304 | f |
2022-10-02 19:58:45 | 2022-10-18 | 22 | 0.08504631 | m |
2022-10-02 15:35:07 | 2022-10-05 | 23 | 0.93193100 | i |
2022-10-02 05:57:29 | 2022-10-21 | 24 | 0.82549381 | y |
2022-10-02 03:49:29 | 2022-10-04 | 25 | 0.87586050 | g |
2022-10-02 16:10:10 | 2022-10-03 | 26 | 0.18427842 | f |
2022-10-01 23:03:23 | 2022-10-10 | 27 | 0.19592865 | e |
2022-10-02 14:11:55 | 2022-10-03 | 28 | 0.20376826 | v |
2022-10-02 05:42:00 | 2022-10-01 | 29 | 0.71602347 | u |
2022-10-02 14:37:36 | 2022-10-11 | 30 | 0.47020862 | u |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2022-10-02 13:04:54 | 2022-10-25 | 971 | 0.86671718 | j |
2022-10-01 21:17:47 | 2022-10-07 | 972 | 0.56333952 | d |
2022-10-02 00:42:52 | 2022-10-18 | 973 | 0.53318103 | r |
2022-10-02 07:44:26 | 2022-10-06 | 974 | 0.01763443 | c |
2022-10-02 00:33:14 | 2022-10-26 | 975 | 0.93499874 | l |
2022-10-02 04:32:36 | 2022-10-22 | 976 | 0.28361392 | q |
2022-10-01 23:27:32 | 2022-10-01 | 977 | 0.27174637 | e |
2022-10-02 17:35:56 | 2022-10-04 | 978 | 0.84402890 | g |
2022-10-02 09:59:34 | 2022-10-04 | 979 | 0.69759029 | g |
2022-10-02 07:21:53 | 2022-10-02 | 980 | 0.26624991 | r |
2022-10-02 15:04:21 | 2022-10-29 | 981 | 0.35530486 | a |
2022-10-02 00:08:30 | 2022-10-13 | 982 | 0.05575729 | l |
2022-10-02 02:10:33 | 2022-10-07 | 983 | 0.68215359 | m |
2022-10-02 01:44:38 | 2022-10-01 | 984 | 0.69344671 | g |
2022-10-02 02:07:59 | 2022-10-05 | 985 | 0.93304451 | k |
2022-10-02 12:21:03 | 2022-10-01 | 986 | 0.06716917 | i |
2022-10-01 23:56:20 | 2022-10-29 | 987 | 0.36384079 | w |
2022-10-02 11:39:18 | 2022-10-12 | 988 | 0.76277196 | i |
2022-10-02 17:49:47 | 2022-10-08 | 989 | 0.16140199 | j |
2022-10-02 13:45:43 | 2022-10-11 | 990 | 0.17523457 | x |
2022-10-02 03:13:52 | 2022-10-15 | 991 | 0.93648914 | c |
2022-10-02 06:18:18 | 2022-10-03 | 992 | 0.49585062 | h |
2022-10-02 03:02:46 | 2022-10-17 | 993 | 0.66820877 | p |
2022-10-02 09:21:35 | 2022-10-03 | 994 | 0.25612216 | h |
2022-10-02 02:01:02 | 2022-10-10 | 995 | 0.62971457 | a |
2022-10-02 05:38:54 | 2022-10-04 | 996 | 0.23914501 | x |
2022-10-02 17:17:02 | 2022-10-02 | 997 | 0.64705285 | p |
2022-10-02 09:45:24 | 2022-10-23 | 998 | 0.19531533 | j |
2022-10-02 10:20:00 | 2022-10-11 | 999 | 0.75877675 | s |
2022-10-02 19:40:23 | 2022-10-15 | 1000 | 0.89462929 | p |
提取子集
如$
和[[
,[[
能够按照名称或位置提取变量,$
只能按照名称提取变量。
# Small example tibble, then extraction of column x by name with `$`.
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
df$x
- 0.252373256953433
- 0.348186866613105
- 0.497539844829589
- 0.601780914701521
- 0.887235613539815
# Extract column x by name with `[[`.
df[["x"]]
- 0.252373256953433
- 0.348186866613105
- 0.497539844829589
- 0.601780914701521
- 0.887235613539815
与旧代码交互
# Convert the tibble to a plain data frame and show its class.
df %>%
  as.data.frame() %>%
  class()
使用readr数据导入
library(tidyverse)
readr中的多数函数用于将平面文件转换为数据框。 - read_csv()
- read_fwf()
读取固定宽度的文件。
# Read the CSV file into a tibble.
heights <- read_csv("/Users/a182501/R_data-mining/M2_IFI_Data_Basket.csv")
Rows: 1000 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (14): Payment, Gender, Tenant, Fruits & vegetables, Meat, Milk products,...
dbl (4): Card num., Amount, Income, Age
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the imported data.
heights
Card num. | Amount | Payment | Gender | Tenant | Income | Age | Fruits & vegetables | Meat | Milk products | Canned vegetables | Canned meat | Frozen goods | Beer | Wine | Soda drinks | Fish | Textile |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<dbl> | <dbl> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
39808 | 427.12 | Cheque | M | Yes | 270000 | 46 | No | Yes | Yes | No | No | No | No | No | No | No | Yes |
67362 | 253.56 | Cash | F | Yes | 300000 | 28 | No | Yes | No | No | No | No | No | No | No | No | Yes |
10872 | 206.17 | Cash | M | Yes | 132000 | 36 | No | No | No | Yes | No | Yes | Yes | No | No | Yes | No |
26748 | 236.88 | Card | F | Yes | 122000 | 26 | No | No | Yes | No | No | No | No | Yes | No | No | No |
91609 | 188.13 | Card | M | No | 110000 | 24 | No | No | No | No | No | No | No | No | No | No | No |
26630 | 464.86 | Card | F | Yes | 150000 | 35 | No | Yes | No | No | No | No | No | Yes | No | Yes | No |
62995 | 140.46 | Cash | F | No | 208000 | 30 | Yes | No | No | No | No | No | No | No | Yes | No | No |
38765 | 222.03 | Cash | M | No | 244000 | 22 | No | No | No | No | No | No | Yes | No | No | No | No |
28935 | 229.75 | Cheque | F | Yes | 295000 | 46 | Yes | No | No | No | No | Yes | No | No | No | No | No |
41792 | 145.69 | Cash | M | Yes | 296000 | 22 | Yes | No | No | No | No | No | No | No | No | Yes | No |
59480 | 103.28 | Cash | F | Yes | 271000 | 18 | Yes | Yes | Yes | Yes | No | No | No | Yes | No | Yes | No |
60755 | 137.79 | Cash | F | No | 200000 | 48 | Yes | No | No | No | No | No | No | No | No | Yes | No |
70998 | 365.09 | Card | M | No | 273000 | 43 | No | No | Yes | No | Yes | Yes | No | No | No | Yes | No |
80617 | 102.01 | Cheque | F | No | 280000 | 43 | No | No | No | No | No | No | No | No | Yes | Yes | No |
61144 | 103.73 | Cash | F | Yes | 274000 | 24 | Yes | No | Yes | No | No | No | No | No | Yes | Yes | No |
36405 | 348.22 | Cheque | F | No | 184000 | 19 | No | No | No | No | No | Yes | Yes | No | Yes | No | No |
76567 | 422.48 | Card | M | No | 231000 | 31 | Yes | No | No | Yes | No | No | No | No | No | Yes | No |
85699 | 181.68 | Cash | F | No | 270000 | 29 | No | No | No | No | No | No | No | No | No | Yes | No |
11357 | 107.53 | Cash | F | No | 231000 | 26 | No | No | No | No | No | No | Yes | No | No | Yes | No |
97761 | 323.18 | Card | F | No | 258000 | 38 | Yes | No | No | Yes | No | No | No | Yes | No | Yes | Yes |
20362 | 317.20 | Cash | M | No | 251000 | 38 | No | No | No | No | No | Yes | No | No | No | Yes | No |
33173 | 368.32 | Cash | F | No | 247000 | 43 | No | No | No | No | No | No | No | Yes | No | No | Yes |
69934 | 311.78 | Cheque | F | No | 213000 | 41 | No | No | No | No | No | No | No | No | No | Yes | No |
14743 | 216.81 | Cash | M | No | 124000 | 48 | No | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | No | No |
83071 | 298.53 | Cash | M | No | 181000 | 31 | No | No | No | No | No | No | Yes | No | No | No | Yes |
17571 | 152.70 | Card | F | No | 229000 | 23 | No | No | Yes | No | No | Yes | Yes | No | No | No | No |
37917 | 322.31 | Cheque | F | Yes | 270000 | 32 | No | Yes | No | No | No | No | No | Yes | No | No | Yes |
11236 | 425.66 | Card | M | No | 268000 | 34 | No | No | No | No | No | No | No | No | No | No | No |
47914 | 445.91 | Cash | F | No | 247000 | 32 | No | Yes | No | No | No | No | No | Yes | No | Yes | Yes |
58154 | 491.36 | Cheque | M | Yes | 213000 | 50 | No | No | No | No | Yes | No | No | No | No | No | No |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
47330 | 209.34 | Cash | F | No | 188000 | 21 | Yes | No | No | No | No | Yes | No | Yes | No | No | Yes |
94864 | 434.74 | Cheque | F | Yes | 142000 | 47 | No | No | No | No | No | No | No | Yes | No | No | No |
28959 | 215.85 | Cash | M | Yes | 236000 | 38 | No | No | No | No | No | Yes | No | No | No | No | No |
54462 | 388.38 | Cash | M | Yes | 159000 | 31 | No | No | Yes | Yes | No | Yes | Yes | Yes | No | No | No |
63880 | 167.47 | Cash | F | Yes | 208000 | 23 | Yes | No | No | No | Yes | No | No | No | No | Yes | No |
104630 | 496.88 | Card | F | No | 238000 | 19 | Yes | No | No | No | No | No | No | Yes | No | No | Yes |
11565 | 449.74 | Cash | F | Yes | 164000 | 19 | No | Yes | Yes | Yes | No | No | No | Yes | No | Yes | No |
93527 | 333.61 | Cash | M | No | 297000 | 38 | No | No | No | No | No | No | Yes | Yes | No | No | No |
88812 | 288.76 | Card | F | Yes | 105000 | 34 | No | No | No | No | No | No | Yes | No | No | No | No |
61107 | 303.14 | Card | F | No | 211000 | 30 | Yes | No | Yes | No | No | No | No | No | No | No | No |
104722 | 336.67 | Card | M | No | 167000 | 38 | No | No | No | Yes | No | Yes | Yes | No | No | No | No |
10915 | 124.62 | Card | F | Yes | 135000 | 22 | Yes | No | Yes | No | Yes | Yes | No | No | No | Yes | No |
33056 | 144.01 | Cheque | M | Yes | 235000 | 27 | No | No | No | No | No | No | No | No | No | No | No |
78967 | 361.66 | Cheque | M | No | 145000 | 25 | No | No | No | Yes | No | Yes | Yes | No | No | No | No |
81825 | 250.71 | Card | F | Yes | 123000 | 21 | Yes | No | Yes | No | No | No | No | No | No | Yes | No |
51846 | 370.74 | Card | F | Yes | 140000 | 21 | Yes | No | No | No | No | No | No | No | No | Yes | No |
58288 | 144.81 | Card | M | Yes | 160000 | 24 | No | No | No | No | No | No | No | No | No | No | No |
30459 | 326.75 | Cheque | F | Yes | 205000 | 29 | No | No | No | Yes | No | No | No | Yes | No | Yes | Yes |
70018 | 427.99 | Cheque | F | No | 258000 | 39 | Yes | No | Yes | No | No | Yes | Yes | Yes | No | Yes | Yes |
62789 | 151.60 | Cash | M | Yes | 140000 | 25 | Yes | No | No | Yes | No | Yes | Yes | No | No | No | No |
102887 | 333.15 | Cheque | M | Yes | 203000 | 46 | Yes | No | No | No | No | Yes | No | No | No | No | No |
83102 | 480.07 | Cash | M | No | 146000 | 47 | No | No | No | Yes | No | Yes | Yes | No | No | Yes | No |
57268 | 250.26 | Cash | M | No | 220000 | 25 | Yes | Yes | No | No | No | No | No | No | Yes | No | No |
78139 | 485.57 | Cash | M | Yes | 252000 | 29 | Yes | No | No | Yes | Yes | No | No | No | No | No | No |
86331 | 454.52 | Card | M | Yes | 103000 | 29 | No | No | No | Yes | No | Yes | Yes | No | No | Yes | Yes |
31384 | 372.05 | Card | M | Yes | 275000 | 27 | No | No | No | Yes | No | No | No | No | No | No | No |
63996 | 137.35 | Cheque | F | Yes | 226000 | 23 | Yes | No | No | Yes | No | No | No | No | No | Yes | No |
99025 | 290.79 | Card | M | No | 274000 | 42 | No | Yes | No | No | No | No | No | No | No | No | No |
95921 | 348.57 | Cash | F | No | 233000 | 43 | Yes | No | No | No | No | No | No | Yes | No | No | Yes |
99164 | 306.96 | Cash | M | Yes | 216000 | 26 | No | No | Yes | No | No | No | No | No | Yes | No | Yes |
read_csv("a,b,c
1,2,3
4,5,6")
Rows: 2 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (3): a, b, c
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a | b | c |
---|---|---|
<dbl> | <dbl> | <dbl> |
1 | 2 | 3 |
4 | 5 | 6 |
read_csv("the first line of metadata
the second line of metadata
1,2,3",skip=2)
Rows: 0 Columns: 3
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): 1, 2, 3
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1 | 2 | 3 |
---|---|---|
<chr> | <chr> | <chr> |
readr中skip
参数能够直接跳过文件开头的若干行。注意:跳过之后的第一行会被当作列名(上例中 1,2,3 变成了列名,所以结果是 0 行 3 列);如果这一行是数据,应同时指定 col_names = FALSE。
parse_double("1.2")
readr 使用了“地区”(locale)这一概念:它是一个对象,用来按不同地区的习惯指定解析选项(例如小数点符号)。
parse_double("1,23",locale = locale(decimal_mark = ","))
parse_number("20%")
parse_number
能够忽略其他非数值型字符。可用于处理百分比或货币
使用lubridate处理时间
创建时间或日期
library(tidyverse)
library(lubridate)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ tibble 3.1.8 ✔ purrr 0.3.4
✔ tidyr 1.2.0 ✔ stringr 1.4.1
✔ readr 2.1.2 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ stats::filter() masks dplyr::filter()
✖ stats::lag() masks dplyr::lag()
Attachement du package : ‘lubridate’
Les objets suivants sont masqués depuis ‘package:base’:
date, intersect, setdiff, union
today()
now()
[1] "2022-10-09 14:02:14 CST"
通过字符串创建
ymd("2017-10-10")
mdy("January 31st,2017")
ymd(20170131)
加入一个时区参数
ymd(20170101,tz="UTC")
[1] "2017-01-01 UTC"
日期时间成分
# Parse a date-time string, then pull out individual components of it.
datetime <- ymd_hms("2016-10-11 12:34:11")
year(datetime)
mday(datetime) # day of the month
yday(datetime) # day of the year
wday(datetime) # day of the week
mday()
一个月中的第几天;yday()
一年中的第几天;wday()
一周中的第几天
重要的原子向量
逻辑型
1:10%%3==0
- FALSE
- FALSE
- TRUE
- FALSE
- FALSE
- TRUE
- FALSE
- FALSE
- TRUE
- FALSE
数值型
typeof(1)
typeof(1L)
# The result of floating-point arithmetic is stored as a double.
x <- sqrt(2)^2
typeof(x)
字符型
缺失值
强制转换
as.logical()
、as.integer()
、as.double()
等函数转换方式,就是一种强制转换
# Draw 100 values from 1..20 (with replacement) and flag those above 10.
x <- sample(20, 100, replace = TRUE)
y <- x > 10
# In a numeric context TRUE counts as 1 and FALSE as 0, so sum() gives the
# number of TRUE values and mean() gives their proportion.
sum(y)
mean(y)
# Combining a logical with an integer yields the more general type.
typeof(c(TRUE, 1L))
检验函数
sample(10)+100
- 102
- 101
- 108
- 104
- 107
- 103
- 105
- 109
- 110
- 106
runif(10)>0.5
- FALSE
- FALSE
- TRUE
- TRUE
- FALSE
- TRUE
- TRUE
- FALSE
- TRUE
- FALSE
使用purrr
实现迭代
for
循环
# Example data: four columns, each holding 10 draws from rnorm().
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
df
a | b | c | d |
---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> |
0.8970158 | 0.6873088 | 0.80024365 | -0.98162763 |
-0.5751092 | -0.3848371 | 1.86547931 | -0.79731893 |
-0.5440100 | 0.9645361 | -0.74678144 | -1.06925662 |
-1.7279895 | -1.3180739 | -0.07265774 | -0.04647237 |
-0.8031987 | -0.1179136 | 2.07089258 | -0.41219837 |
0.3596368 | -0.4623563 | -0.04209510 | 1.24028689 |
0.6522909 | 0.4926086 | -2.17560267 | 1.26102775 |
-1.8867870 | -0.8284376 | 0.15331049 | -1.54542523 |
-2.4719750 | 0.5141981 | 1.98788444 | -0.04804680 |
-1.4517999 | 0.6235135 | -1.96672755 | -1.47042800 |
c(mean(df$a),mean(df$b))
- -0.755192578727417
- 0.0170546566497491
每个循环都包含三个部分。
输出:output<-vector("double",length(x))
开始循环之前,必须为输出结果分配足够的空间,这对循环效率来说比较重要;若在每次迭代时都使用c()
来保存循环结果,for
循环就非常慢
序列:i in seq_along(df)
循环体: output[[i]]<-median(df[[i]])
#' Rescale a numeric vector linearly to the interval [0, 1]
#'
#' @param x A numeric vector. `NA` values are ignored when computing the
#'   range and stay `NA` in the result.
#' @return A numeric vector the same length as `x`, where the smallest
#'   non-missing value maps to 0 and the largest to 1.
rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Replace column a with its rescaled version and print it.
df$a <- rescale01(df$a)
df$a
- 1
- 0.563036810301723
- 0.572267809015891
- 0.220833333612345
- 0.495334167740919
- 0.840492593468615
- 0.927359592117168
- 0.173698292087717
- 0
- 0.302813271637925
# Rescale every column of df in place; [[ extracts and replaces a single
# column by position.
for (i in seq_along(df)) {
  df[[i]] <- rescale01(df[[i]])
}
note:循环列表或数据框,使用[[
循环的方式: - 元素循环 - 名称循环
使用索引值进行循环是最常用的方式
未知长度的输出
# The output length is not known in advance: each iteration appends a random
# number (1..100) of normal draws centred on the current mean.
means <- c(0, 1, 2)
output <- double()
for (i in seq_along(means)) {
  n <- sample(100, 1)
  # Growing with c() copies the whole vector on every iteration; that is
  # tolerable for three iterations, but for longer loops collect the pieces
  # in a list and combine them once at the end.
  output <- c(output, rnorm(n, means[[i]]))
}
str(output)
num [1:253] -0.366 0.595 -0.816 -0.907 -1.424 ...
for 与函数编程
# Example data: four columns, each holding 10 draws from rnorm().
df <- tibble(
  a = rnorm(10),
  b = rnorm(10),
  c = rnorm(10),
  d = rnorm(10)
)
# Preallocate one slot per column, then fill in each column mean.
output <- vector("double", length(df))
for (i in seq_along(df)) {
  output[[i]] <- mean(df[[i]])
}
output
- -0.392677235398266
- -0.000158837329926254
- -0.465850203489892
- 0.344197800836397
#' Summarise every column of a data frame with one function
#'
#' @param df A data frame (or list) whose elements are passed to `fun`
#'   one at a time.
#' @param fun A function that takes one column and returns a single number,
#'   e.g. `mean` or `median`.
#' @return A double vector with one element per column of `df`.
col_summary <- function(df, fun) {
  out <- vector("double", length(df))
  for (i in seq_along(df)) {
    out[i] <- fun(df[[i]])
  }
  out
}
col_summary(df,median)
- -0.512714109408632
- -0.111337745611366
- -0.45908034745605
- 0.433442364331742
col_summary(df,mean)
- -0.392677235398266
- -0.000158837329926254
- -0.465850203489892
- 0.344197800836397
将函数作为参数传入。
映射函数
先对于向量进行映射,再对于每个元素进行处理,最后保存结果。 - map()
用于输出列表; - map_lgl()
用于输出逻辑型向量; - map_int()
用于输出整型向量; - map_dbl() 用于输出双精度型向量(下面的例子使用的就是它)。
library("purrr")
map_dbl(df,mean)
- a
- -0.392677235398266
- b
- -0.000158837329926254
- c
- -0.465850203489892
- d
- 0.344197800836397
使用dplyr
处理关系数据
键:用于连接每对数据表的变量称为键。键是能够唯一标识观测的变量(或变量集合);在简单情况下,单一变量就足以标识一个观测。 - 主键:唯一标识其所在数据表中的观测; - 外键:唯一标识另一个数据表中的观测
library("nycflights13")
planes
tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
---|---|---|---|---|---|---|---|---|
<chr> | <int> | <chr> | <chr> | <chr> | <int> | <int> | <int> | <chr> |
N10156 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N102UW | 1998 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N103US | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N104UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N10575 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NA | Turbo-fan |
N105UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N107US | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N108UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N109UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N110UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
N11106 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11107 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11109 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11113 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11119 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11121 | 2003 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11127 | 2003 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11137 | 2003 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11140 | 2003 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11150 | 2003 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11155 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11164 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11165 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11176 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11181 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11184 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11187 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11189 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11191 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
N11192 | 2005 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
N984DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N985AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N985DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N986AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N986DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N987AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N987DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N988AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N988DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N989AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N989DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N990AT | 2001 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N990DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N991AT | NA | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N991DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N992AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N992DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N993AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N993DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N994AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N994DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS CORPORATION | MD-88 | 2 | 142 | NA | Turbo-jet |
N995AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N995DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N996AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N996DL | 1991 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N997AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N997DL | 1992 | Fixed wing multi engine | MCDONNELL DOUGLAS AIRCRAFT CO | MD-88 | 2 | 142 | NA | Turbo-fan |
N998AT | 2002 | Fixed wing multi engine | BOEING | 717-200 | 2 | 100 | NA | Turbo-fan |
N998DL | 1992 | Fixed wing multi engine | MCDONNELL DOUGLAS CORPORATION | MD-88 | 2 | 142 | NA | Turbo-jet |
N999DN | 1992 | Fixed wing multi engine | MCDONNELL DOUGLAS CORPORATION | MD-88 | 2 | 142 | NA | Turbo-jet |
# Check that tailnum is a primary key of planes: count the rows per tailnum
# and keep those occurring more than once. An empty result means every
# tailnum is unique.
planes %>%
  count(tailnum) %>%
  dplyr::filter(n > 1)
tailnum | n |
---|---|
<chr> | <int> |
合并连接
合并连接可以将两个表格中的变量组合起来:先通过两个表格的键匹配观测,然后再将一个表格中的变量复制到另一个表格中。
和mutate()
函数一样,链接函数也会将变量添加到表格右侧。
head(flights)
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <chr> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> |
2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 |
2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
library(nycflights13)
# Keep a narrower copy of flights so the effect of the joins is easier to see.
flights2 <- flights %>%
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2
year | month | day | hour | origin | dest | tailnum | carrier |
---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <dbl> | <chr> | <chr> | <chr> | <chr> |
2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA |
2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA |
2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA |
2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 |
2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL |
2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA |
2013 | 1 | 1 | 6 | EWR | FLL | N516JB | B6 |
2013 | 1 | 1 | 6 | LGA | IAD | N829AS | EV |
2013 | 1 | 1 | 6 | JFK | MCO | N593JB | B6 |
2013 | 1 | 1 | 6 | LGA | ORD | N3ALAA | AA |
2013 | 1 | 1 | 6 | JFK | PBI | N793JB | B6 |
2013 | 1 | 1 | 6 | JFK | TPA | N657JB | B6 |
2013 | 1 | 1 | 6 | JFK | LAX | N29129 | UA |
2013 | 1 | 1 | 6 | EWR | SFO | N53441 | UA |
2013 | 1 | 1 | 6 | LGA | DFW | N3DUAA | AA |
2013 | 1 | 1 | 5 | JFK | BOS | N708JB | B6 |
2013 | 1 | 1 | 6 | EWR | LAS | N76515 | UA |
2013 | 1 | 1 | 6 | LGA | FLL | N595JB | B6 |
2013 | 1 | 1 | 6 | LGA | ATL | N542MQ | MQ |
2013 | 1 | 1 | 6 | EWR | PBI | N644JB | B6 |
2013 | 1 | 1 | 6 | LGA | MSP | N971DL | DL |
2013 | 1 | 1 | 6 | LGA | DTW | N730MQ | MQ |
2013 | 1 | 1 | 6 | EWR | MIA | N633AA | AA |
2013 | 1 | 1 | 6 | JFK | ATL | N3739P | DL |
2013 | 1 | 1 | 6 | EWR | MIA | N53442 | UA |
2013 | 1 | 1 | 6 | EWR | ORD | N9EAMQ | MQ |
2013 | 1 | 1 | 6 | JFK | SFO | N532UA | UA |
2013 | 1 | 1 | 6 | JFK | RSW | N635JB | B6 |
2013 | 1 | 1 | 6 | JFK | SJU | N794JB | B6 |
2013 | 1 | 1 | 6 | EWR | ATL | N326NB | DL |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 9 | 30 | 21 | LGA | CHO | N712EV | EV |
2013 | 9 | 30 | 21 | EWR | CLT | N16546 | EV |
2013 | 9 | 30 | 21 | JFK | DEN | N807JB | B6 |
2013 | 9 | 30 | 20 | LGA | RIC | N751EV | EV |
2013 | 9 | 30 | 21 | JFK | DCA | N807MQ | MQ |
2013 | 9 | 30 | 21 | JFK | LAX | N335AA | AA |
2013 | 9 | 30 | 21 | EWR | PWM | N12957 | EV |
2013 | 9 | 30 | 21 | JFK | SJU | N633JB | B6 |
2013 | 9 | 30 | 21 | LGA | FLL | N627JB | B6 |
2013 | 9 | 30 | 21 | EWR | BOS | N813UA | UA |
2013 | 9 | 30 | 21 | EWR | MHT | N10575 | EV |
2013 | 9 | 30 | 18 | JFK | BUF | N906XJ | 9E |
2013 | 9 | 30 | 22 | LGA | BGR | N722EV | EV |
2013 | 9 | 30 | 21 | LGA | BNA | N532MQ | MQ |
2013 | 9 | 30 | 20 | EWR | STL | N12145 | EV |
2013 | 9 | 30 | 22 | JFK | PWM | N193JB | B6 |
2013 | 9 | 30 | 21 | EWR | SFO | N578UA | UA |
2013 | 9 | 30 | 20 | JFK | MCO | N804JB | B6 |
2013 | 9 | 30 | 22 | JFK | BTV | N318JB | B6 |
2013 | 9 | 30 | 22 | JFK | SYR | N354JB | B6 |
2013 | 9 | 30 | 22 | JFK | BUF | N281JB | B6 |
2013 | 9 | 30 | 22 | JFK | ROC | N346JB | B6 |
2013 | 9 | 30 | 22 | JFK | BOS | N565JB | B6 |
2013 | 9 | 30 | 23 | JFK | PSE | N516JB | B6 |
2013 | 9 | 30 | 18 | LGA | BNA | N740EV | EV |
2013 | 9 | 30 | 14 | JFK | DCA | NA | 9E |
2013 | 9 | 30 | 22 | LGA | SYR | NA | 9E |
2013 | 9 | 30 | 12 | LGA | BNA | N535MQ | MQ |
2013 | 9 | 30 | 11 | LGA | CLE | N511MQ | MQ |
2013 | 9 | 30 | 8 | LGA | RDU | N839MQ | MQ |
通过mutate()
来实现变量的生成,再使用match()
来进行变量的匹配
# Look up the airline name stored in row 12 of airlines.
airlines$name[12]
match()
返回的是airlines$carrier
的位置
# Add the airline name by hand: match() returns, for each carrier code in
# flights, its position in airlines$carrier, and that position is then used
# to index airlines$name.
flights %>%
  select(-origin, -dest) %>%
  mutate(name = airlines$name[match(carrier, airlines$carrier)])
year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | air_time | distance | hour | minute | time_hour | name |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <dbl> | <int> | <int> | <dbl> | <chr> | <int> | <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dttm> | <chr> |
2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 | American Airlines Inc. |
2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 | JetBlue Airways |
2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 | Delta Air Lines Inc. |
2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 555 | 600 | -5 | 913 | 854 | 19 | B6 | 507 | N516JB | 158 | 1065 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 557 | 600 | -3 | 709 | 723 | -14 | EV | 5708 | N829AS | 53 | 229 | 6 | 0 | 2013-01-01 06:00:00 | ExpressJet Airlines Inc. |
2013 | 1 | 1 | 557 | 600 | -3 | 838 | 846 | -8 | B6 | 79 | N593JB | 140 | 944 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 558 | 600 | -2 | 753 | 745 | 8 | AA | 301 | N3ALAA | 138 | 733 | 6 | 0 | 2013-01-01 06:00:00 | American Airlines Inc. |
2013 | 1 | 1 | 558 | 600 | -2 | 849 | 851 | -2 | B6 | 49 | N793JB | 149 | 1028 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 558 | 600 | -2 | 853 | 856 | -3 | B6 | 71 | N657JB | 158 | 1005 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 558 | 600 | -2 | 924 | 917 | 7 | UA | 194 | N29129 | 345 | 2475 | 6 | 0 | 2013-01-01 06:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 558 | 600 | -2 | 923 | 937 | -14 | UA | 1124 | N53441 | 361 | 2565 | 6 | 0 | 2013-01-01 06:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 559 | 600 | -1 | 941 | 910 | 31 | AA | 707 | N3DUAA | 257 | 1389 | 6 | 0 | 2013-01-01 06:00:00 | American Airlines Inc. |
2013 | 1 | 1 | 559 | 559 | 0 | 702 | 706 | -4 | B6 | 1806 | N708JB | 44 | 187 | 5 | 59 | 2013-01-01 05:00:00 | JetBlue Airways |
2013 | 1 | 1 | 559 | 600 | -1 | 854 | 902 | -8 | UA | 1187 | N76515 | 337 | 2227 | 6 | 0 | 2013-01-01 06:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 600 | 600 | 0 | 851 | 858 | -7 | B6 | 371 | N595JB | 152 | 1076 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 600 | 600 | 0 | 837 | 825 | 12 | MQ | 4650 | N542MQ | 134 | 762 | 6 | 0 | 2013-01-01 06:00:00 | Envoy Air |
2013 | 1 | 1 | 601 | 600 | 1 | 844 | 850 | -6 | B6 | 343 | N644JB | 147 | 1023 | 6 | 0 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 602 | 610 | -8 | 812 | 820 | -8 | DL | 1919 | N971DL | 170 | 1020 | 6 | 10 | 2013-01-01 06:00:00 | Delta Air Lines Inc. |
2013 | 1 | 1 | 602 | 605 | -3 | 821 | 805 | 16 | MQ | 4401 | N730MQ | 105 | 502 | 6 | 5 | 2013-01-01 06:00:00 | Envoy Air |
2013 | 1 | 1 | 606 | 610 | -4 | 858 | 910 | -12 | AA | 1895 | N633AA | 152 | 1085 | 6 | 10 | 2013-01-01 06:00:00 | American Airlines Inc. |
2013 | 1 | 1 | 606 | 610 | -4 | 837 | 845 | -8 | DL | 1743 | N3739P | 128 | 760 | 6 | 10 | 2013-01-01 06:00:00 | Delta Air Lines Inc. |
2013 | 1 | 1 | 607 | 607 | 0 | 858 | 915 | -17 | UA | 1077 | N53442 | 157 | 1085 | 6 | 7 | 2013-01-01 06:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 608 | 600 | 8 | 807 | 735 | 32 | MQ | 3768 | N9EAMQ | 139 | 719 | 6 | 0 | 2013-01-01 06:00:00 | Envoy Air |
2013 | 1 | 1 | 611 | 600 | 11 | 945 | 931 | 14 | UA | 303 | N532UA | 366 | 2586 | 6 | 0 | 2013-01-01 06:00:00 | United Air Lines Inc. |
2013 | 1 | 1 | 613 | 610 | 3 | 925 | 921 | 4 | B6 | 135 | N635JB | 175 | 1074 | 6 | 10 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 615 | 615 | 0 | 1039 | 1100 | -21 | B6 | 709 | N794JB | 182 | 1598 | 6 | 15 | 2013-01-01 06:00:00 | JetBlue Airways |
2013 | 1 | 1 | 615 | 615 | 0 | 833 | 842 | -9 | DL | 575 | N326NB | 120 | 746 | 6 | 15 | 2013-01-01 06:00:00 | Delta Air Lines Inc. |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2013 | 9 | 30 | 2123 | 2125 | -2 | 2223 | 2247 | -24 | EV | 5489 | N712EV | 45 | 305 | 21 | 25 | 2013-09-30 21:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2127 | 2129 | -2 | 2314 | 2323 | -9 | EV | 3833 | N16546 | 72 | 529 | 21 | 29 | 2013-09-30 21:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2128 | 2130 | -2 | 2328 | 2359 | -31 | B6 | 97 | N807JB | 213 | 1626 | 21 | 30 | 2013-09-30 21:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2129 | 2059 | 30 | 2230 | 2232 | -2 | EV | 5048 | N751EV | 45 | 292 | 20 | 59 | 2013-09-30 20:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2131 | 2140 | -9 | 2225 | 2255 | -30 | MQ | 3621 | N807MQ | 36 | 213 | 21 | 40 | 2013-09-30 21:00:00 | Envoy Air |
2013 | 9 | 30 | 2140 | 2140 | 0 | 10 | 40 | -30 | AA | 185 | N335AA | 298 | 2475 | 21 | 40 | 2013-09-30 21:00:00 | American Airlines Inc. |
2013 | 9 | 30 | 2142 | 2129 | 13 | 2250 | 2239 | 11 | EV | 4509 | N12957 | 47 | 284 | 21 | 29 | 2013-09-30 21:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2145 | 2145 | 0 | 115 | 140 | -25 | B6 | 1103 | N633JB | 192 | 1598 | 21 | 45 | 2013-09-30 21:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2147 | 2137 | 10 | 30 | 27 | 3 | B6 | 1371 | N627JB | 139 | 1076 | 21 | 37 | 2013-09-30 21:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2149 | 2156 | -7 | 2245 | 2308 | -23 | UA | 523 | N813UA | 37 | 200 | 21 | 56 | 2013-09-30 21:00:00 | United Air Lines Inc. |
2013 | 9 | 30 | 2150 | 2159 | -9 | 2250 | 2306 | -16 | EV | 3842 | N10575 | 39 | 209 | 21 | 59 | 2013-09-30 21:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2159 | 1845 | 194 | 2344 | 2030 | 194 | 9E | 3320 | N906XJ | 50 | 301 | 18 | 45 | 2013-09-30 18:00:00 | Endeavor Air Inc. |
2013 | 9 | 30 | 2203 | 2205 | -2 | 2339 | 2331 | 8 | EV | 5311 | N722EV | 61 | 378 | 22 | 5 | 2013-09-30 22:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2207 | 2140 | 27 | 2257 | 2250 | 7 | MQ | 3660 | N532MQ | 97 | 764 | 21 | 40 | 2013-09-30 21:00:00 | Envoy Air |
2013 | 9 | 30 | 2211 | 2059 | 72 | 2339 | 2242 | 57 | EV | 4672 | N12145 | 120 | 872 | 20 | 59 | 2013-09-30 20:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | 2231 | 2245 | -14 | 2335 | 2356 | -21 | B6 | 108 | N193JB | 48 | 273 | 22 | 45 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2233 | 2113 | 80 | 112 | 30 | 42 | UA | 471 | N578UA | 318 | 2565 | 21 | 13 | 2013-09-30 21:00:00 | United Air Lines Inc. |
2013 | 9 | 30 | 2235 | 2001 | 154 | 59 | 2249 | 130 | B6 | 1083 | N804JB | 123 | 944 | 20 | 1 | 2013-09-30 20:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2237 | 2245 | -8 | 2345 | 2353 | -8 | B6 | 234 | N318JB | 43 | 266 | 22 | 45 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2240 | 2245 | -5 | 2334 | 2351 | -17 | B6 | 1816 | N354JB | 41 | 209 | 22 | 45 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2240 | 2250 | -10 | 2347 | 7 | -20 | B6 | 2002 | N281JB | 52 | 301 | 22 | 50 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2241 | 2246 | -5 | 2345 | 1 | -16 | B6 | 486 | N346JB | 47 | 264 | 22 | 46 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2307 | 2255 | 12 | 2359 | 2358 | 1 | B6 | 718 | N565JB | 33 | 187 | 22 | 55 | 2013-09-30 22:00:00 | JetBlue Airways |
2013 | 9 | 30 | 2349 | 2359 | -10 | 325 | 350 | -25 | B6 | 745 | N516JB | 196 | 1617 | 23 | 59 | 2013-09-30 23:00:00 | JetBlue Airways |
2013 | 9 | 30 | NA | 1842 | NA | NA | 2019 | NA | EV | 5274 | N740EV | NA | 764 | 18 | 42 | 2013-09-30 18:00:00 | ExpressJet Airlines Inc. |
2013 | 9 | 30 | NA | 1455 | NA | NA | 1634 | NA | 9E | 3393 | NA | NA | 213 | 14 | 55 | 2013-09-30 14:00:00 | Endeavor Air Inc. |
2013 | 9 | 30 | NA | 2200 | NA | NA | 2312 | NA | 9E | 3525 | NA | NA | 198 | 22 | 0 | 2013-09-30 22:00:00 | Endeavor Air Inc. |
2013 | 9 | 30 | NA | 1210 | NA | NA | 1330 | NA | MQ | 3461 | N535MQ | NA | 764 | 12 | 10 | 2013-09-30 12:00:00 | Envoy Air |
2013 | 9 | 30 | NA | 1159 | NA | NA | 1344 | NA | MQ | 3572 | N511MQ | NA | 419 | 11 | 59 | 2013-09-30 11:00:00 | Envoy Air |
2013 | 9 | 30 | NA | 840 | NA | NA | 1020 | NA | MQ | 3531 | N839MQ | NA | 431 | 8 | 40 | 2013-09-30 08:00:00 | Envoy Air |
head(airlines)
carrier | name |
---|---|
<chr> | <chr> |
9E | Endeavor Air Inc. |
AA | American Airlines Inc. |
AS | Alaska Airlines Inc. |
B6 | JetBlue Airways |
DL | Delta Air Lines Inc. |
EV | ExpressJet Airlines Inc. |
理解连接
匹配就是实现在两行之间的交集。
内连接
内连接是最简单的一种连接:只要两个观测的键相等,内连接就会将它们匹配起来。
字符串组合
组合两个或更多的字符串可以使用str_c()
函数
library(stringr)
str_c("x","y")
基础匹配
# Show where the pattern "an" matches in each string.
x <- c("apple", "banana", "pear")
str_view(x, "an")
匹配检测
str_detect(x,"an")
- FALSE
- TRUE
- FALSE
sum(str_detect(words,"^t"))
有多少个以t开头的常用单词
words
- 'a'
- 'able'
- 'about'
- 'absolute'
- 'accept'
- 'account'
- 'achieve'
- 'across'
- 'act'
- 'active'
- 'actual'
- 'add'
- 'address'
- 'admit'
- 'advertise'
- 'affect'
- 'afford'
- 'after'
- 'afternoon'
- 'again'
- 'against'
- 'age'
- 'agent'
- 'ago'
- 'agree'
- 'air'
- 'all'
- 'allow'
- 'almost'
- 'along'
- 'already'
- 'alright'
- 'also'
- 'although'
- 'always'
- 'america'
- 'amount'
- 'and'
- 'another'
- 'answer'
- 'any'
- 'apart'
- 'apparent'
- 'appear'
- 'apply'
- 'appoint'
- 'approach'
- 'appropriate'
- 'area'
- 'argue'
- 'arm'
- 'around'
- 'arrange'
- 'art'
- 'as'
- 'ask'
- 'associate'
- 'assume'
- 'at'
- 'attend'
- 'authority'
- 'available'
- 'aware'
- 'away'
- 'awful'
- 'baby'
- 'back'
- 'bad'
- 'bag'
- 'balance'
- 'ball'
- 'bank'
- 'bar'
- 'base'
- 'basis'
- 'be'
- 'bear'
- 'beat'
- 'beauty'
- 'because'
- 'become'
- 'bed'
- 'before'
- 'begin'
- 'behind'
- 'believe'
- 'benefit'
- 'best'
- 'bet'
- 'between'
- 'big'
- 'bill'
- 'birth'
- 'bit'
- 'black'
- 'bloke'
- 'blood'
- 'blow'
- 'blue'
- 'board'
- 'boat'
- 'body'
- 'book'
- 'both'
- 'bother'
- 'bottle'
- 'bottom'
- 'box'
- 'boy'
- 'break'
- 'brief'
- 'brilliant'
- 'bring'
- 'britain'
- 'brother'
- 'budget'
- 'build'
- 'bus'
- 'business'
- 'busy'
- 'but'
- 'buy'
- 'by'
- 'cake'
- 'call'
- 'can'
- 'car'
- 'card'
- 'care'
- 'carry'
- 'case'
- 'cat'
- 'catch'
- 'cause'
- 'cent'
- 'centre'
- 'certain'
- 'chair'
- 'chairman'
- 'chance'
- 'change'
- 'chap'
- 'character'
- 'charge'
- 'cheap'
- 'check'
- 'child'
- 'choice'
- 'choose'
- 'Christ'
- 'Christmas'
- 'church'
- 'city'
- 'claim'
- 'class'
- 'clean'
- 'clear'
- 'client'
- 'clock'
- 'close'
- 'closes'
- 'clothe'
- 'club'
- 'coffee'
- 'cold'
- 'colleague'
- 'collect'
- 'college'
- 'colour'
- 'come'
- 'comment'
- 'commit'
- 'committee'
- 'common'
- 'community'
- 'company'
- 'compare'
- 'complete'
- 'compute'
- 'concern'
- 'condition'
- 'confer'
- 'consider'
- 'consult'
- 'contact'
- 'continue'
- 'contract'
- 'control'
- 'converse'
- 'cook'
- 'copy'
- 'corner'
- 'correct'
- 'cost'
- 'could'
- 'council'
- 'count'
- 'country'
- 'county'
- 'couple'
- ⋯
- 'society'
- 'some'
- 'son'
- 'soon'
- 'sorry'
- 'sort'
- 'sound'
- 'south'
- 'space'
- 'speak'
- 'special'
- 'specific'
- 'speed'
- 'spell'
- 'spend'
- 'square'
- 'staff'
- 'stage'
- 'stairs'
- 'stand'
- 'standard'
- 'start'
- 'state'
- 'station'
- 'stay'
- 'step'
- 'stick'
- 'still'
- 'stop'
- 'story'
- 'straight'
- 'strategy'
- 'street'
- 'strike'
- 'strong'
- 'structure'
- 'student'
- 'study'
- 'stuff'
- 'stupid'
- 'subject'
- 'succeed'
- 'such'
- 'sudden'
- 'suggest'
- 'suit'
- 'summer'
- 'sun'
- 'sunday'
- 'supply'
- 'support'
- 'suppose'
- 'sure'
- 'surprise'
- 'switch'
- 'system'
- 'table'
- 'take'
- 'talk'
- 'tape'
- 'tax'
- 'tea'
- 'teach'
- 'team'
- 'telephone'
- 'television'
- 'tell'
- 'ten'
- 'tend'
- 'term'
- 'terrible'
- 'test'
- 'than'
- 'thank'
- 'the'
- 'then'
- 'there'
- 'therefore'
- 'they'
- 'thing'
- 'think'
- 'thirteen'
- 'thirty'
- 'this'
- 'thou'
- 'though'
- 'thousand'
- 'three'
- 'through'
- 'throw'
- 'thursday'
- 'tie'
- 'time'
- 'to'
- 'today'
- 'together'
- 'tomorrow'
- 'tonight'
- 'too'
- 'top'
- 'total'
- 'touch'
- 'toward'
- 'town'
- 'trade'
- 'traffic'
- 'train'
- 'transport'
- 'travel'
- 'treat'
- 'tree'
- 'trouble'
- 'true'
- 'trust'
- 'try'
- 'tuesday'
- 'turn'
- 'twelve'
- 'twenty'
- 'two'
- 'type'
- 'under'
- 'understand'
- 'union'
- 'unit'
- 'unite'
- 'university'
- 'unless'
- 'until'
- 'up'
- 'upon'
- 'use'
- 'usual'
- 'value'
- 'various'
- 'very'
- 'video'
- 'view'
- 'village'
- 'visit'
- 'vote'
- 'wage'
- 'wait'
- 'walk'
- 'wall'
- 'want'
- 'war'
- 'warm'
- 'wash'
- 'waste'
- 'watch'
- 'water'
- 'way'
- 'we'
- 'wear'
- 'wednesday'
- 'wee'
- 'week'
- 'weigh'
- 'welcome'
- 'well'
- 'west'
- 'what'
- 'when'
- 'where'
- 'whether'
- 'which'
- 'while'
- 'white'
- 'who'
- 'whole'
- 'why'
- 'wide'
- 'wife'
- 'will'
- 'win'
- 'wind'
- 'window'
- 'wish'
- 'with'
- 'within'
- 'without'
- 'woman'
- 'wonder'
- 'wood'
- 'word'
- 'work'
- 'world'
- 'worry'
- 'worse'
- 'worth'
- 'would'
- 'write'
- 'wrong'
- 'year'
- 'yes'
- 'yesterday'
- 'yet'
- 'you'
- 'young'
length(sentences)
head(sentences)
- 'The birch canoe slid on the smooth planks.'
- 'Glue the sheet to the dark blue background.'
- 'It\'s easy to tell the depth of a well.'
- 'These days a chicken leg is a rare dish.'
- 'Rice is often served in round bowls.'
- 'The juice of lemons makes fine punch.'
用forcats处理因子
library(tidyverse)
library(forcats)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ tibble 3.1.8 ✔ purrr 0.3.4
✔ tidyr 1.2.0 ✔ forcats 0.5.1
✔ readr 2.1.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ stats::filter() masks dplyr::filter()
✖ stats::lag() masks dplyr::lag()
# Months stored as plain strings. x2 contains "jam", which is not a valid
# month abbreviation (presumably a deliberate typo for the factor example).
x1 <- c("dec", "apr", "jan", "mar")
x2 <- c("dec", "apr", "jam", "mar")
# Character vectors sort alphabetically, not in calendar order.
sort(x1)
- 'apr'
- 'dec'
- 'jan'
- 'mar'
# The valid levels, listed in calendar order.
month_levels <- c(
  "jan", "feb", "mar", "apr", "may", "jun",
  "jul", "aug", "sep", "oct", "nov", "dec"
)
# As a factor, x1 sorts by level order instead of alphabetically.
y1 <- factor(x1, levels = month_levels)
sort(y1)
- jan
- mar
- apr
- dec
Levels:
- 'jan'
- 'feb'
- 'mar'
- 'apr'
- 'may'
- 'jun'
- 'jul'
- 'aug'
- 'sep'
- 'oct'
- 'nov'
- 'dec'
# Values that are not among the levels ("jam") are silently turned into NA.
y2 <- factor(x2, levels = month_levels)
y2
- dec
- apr
- <NA>
- mar
Levels:
- 'jan'
- 'feb'
- 'mar'
- 'apr'
- 'may'
- 'jun'
- 'jul'
- 'aug'
- 'sep'
- 'oct'
- 'nov'
- 'dec'
# Print the `gss_cat` data set.
gss_cat
year | marital | age | race | rincome | partyid | relig | denom | tvhours |
---|---|---|---|---|---|---|---|---|
<int> | <fct> | <int> | <fct> | <fct> | <fct> | <fct> | <fct> | <int> |
2000 | Never married | 26 | White | $8000 to 9999 | Ind,near rep | Protestant | Southern baptist | 12 |
2000 | Divorced | 48 | White | $8000 to 9999 | Not str republican | Protestant | Baptist-dk which | NA |
2000 | Widowed | 67 | White | Not applicable | Independent | Protestant | No denomination | 2 |
2000 | Never married | 39 | White | Not applicable | Ind,near rep | Orthodox-christian | Not applicable | 4 |
2000 | Divorced | 25 | White | Not applicable | Not str democrat | None | Not applicable | 1 |
2000 | Married | 25 | White | $20000 - 24999 | Strong democrat | Protestant | Southern baptist | NA |
2000 | Never married | 36 | White | $25000 or more | Not str republican | Christian | Not applicable | 3 |
2000 | Divorced | 44 | White | $7000 to 7999 | Ind,near dem | Protestant | Lutheran-mo synod | NA |
2000 | Married | 44 | White | $25000 or more | Not str democrat | Protestant | Other | 0 |
2000 | Married | 47 | White | $25000 or more | Strong republican | Protestant | Southern baptist | 3 |
2000 | Married | 53 | White | $25000 or more | Not str democrat | Protestant | Other | 2 |
2000 | Married | 52 | White | $25000 or more | Ind,near rep | None | Not applicable | NA |
2000 | Married | 52 | White | $25000 or more | Strong democrat | Protestant | Southern baptist | 1 |
2000 | Married | 51 | White | $25000 or more | Strong republican | Protestant | United methodist | NA |
2000 | Divorced | 52 | White | $25000 or more | Ind,near dem | None | Not applicable | 1 |
2000 | Married | 40 | Black | $25000 or more | Strong democrat | Protestant | Baptist-dk which | 7 |
2000 | Widowed | 77 | White | Not applicable | Strong republican | Jewish | Not applicable | NA |
2000 | Never married | 44 | White | $25000 or more | Independent | None | Not applicable | 3 |
2000 | Married | 40 | White | $10000 - 14999 | Not str democrat | Catholic | Not applicable | 3 |
2000 | Married | 45 | Black | Not applicable | Independent | Protestant | United methodist | NA |
2000 | Married | 48 | White | $25000 or more | Ind,near dem | Catholic | Not applicable | 1 |
2000 | Married | 49 | White | Refused | Strong republican | Protestant | United methodist | 2 |
2000 | Never married | 19 | White | Not applicable | Independent | None | Not applicable | 2 |
2000 | Widowed | 54 | White | $25000 or more | Ind,near rep | Christian | Not applicable | 1 |
2000 | Widowed | 82 | White | Not applicable | Not str democrat | Protestant | Other | 3 |
2000 | Widowed | 83 | White | Not applicable | Strong democrat | Protestant | Episcopal | NA |
2000 | Widowed | 89 | White | Not applicable | Not str democrat | Protestant | Other lutheran | 4 |
2000 | Widowed | 88 | White | Not applicable | Strong republican | Protestant | Afr meth ep zion | NA |
2000 | Divorced | 72 | White | Not applicable | Strong democrat | Protestant | Southern baptist | 7 |
2000 | Widowed | 82 | White | Not applicable | Independent | Protestant | Am bapt ch in usa | NA |
⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ | ⋮ |
2014 | Divorced | 38 | White | $3000 to 3999 | Not str republican | Protestant | Other | 1 |
2014 | Widowed | 46 | White | $25000 or more | Strong democrat | None | Not applicable | 2 |
2014 | Married | 49 | White | Not applicable | Ind,near rep | Protestant | Other | 6 |
2014 | Never married | 34 | White | $25000 or more | Independent | Protestant | United methodist | 2 |
2014 | Married | 54 | White | Not applicable | Independent | Protestant | Other | NA |
2014 | Married | 34 | White | $15000 - 19999 | Ind,near dem | Buddhism | Not applicable | 1 |
2014 | Married | 69 | White | Not applicable | Ind,near dem | Jewish | Not applicable | 3 |
2014 | Divorced | 36 | White | Not applicable | Independent | None | Not applicable | 0 |
2014 | Married | 65 | White | $25000 or more | Not str democrat | None | Not applicable | 2 |
2014 | Married | 48 | White | $20000 - 24999 | Strong democrat | Protestant | Other | 0 |
2014 | Married | 38 | White | $10000 - 14999 | Not str democrat | Protestant | No denomination | 2 |
2014 | Never married | 30 | White | $4000 to 4999 | Ind,near dem | None | Not applicable | 2 |
2014 | Married | 48 | White | $8000 to 9999 | Not str republican | Catholic | Not applicable | 0 |
2014 | Divorced | 49 | White | $25000 or more | Ind,near rep | Other | Not applicable | 2 |
2014 | Married | 54 | White | $25000 or more | Ind,near dem | Protestant | Other | NA |
2014 | Married | 49 | White | $25000 or more | Not str republican | Catholic | Not applicable | NA |
2014 | Married | 53 | White | $25000 or more | Not str democrat | None | Not applicable | 0 |
2014 | Married | 52 | White | $25000 or more | Not str democrat | None | Not applicable | 1 |
2014 | Widowed | 82 | White | Not applicable | Strong democrat | Protestant | Other | 2 |
2014 | Married | 63 | White | Not applicable | Ind,near dem | No answer | No answer | 2 |
2014 | Divorced | 54 | White | $25000 or more | Ind,near rep | Catholic | Not applicable | 3 |
2014 | Married | 62 | White | $25000 or more | Ind,near rep | Protestant | Other | NA |
2014 | Never married | 40 | White | $1000 to 2999 | Not str republican | None | Not applicable | 2 |
2014 | Married | 33 | White | Not applicable | Independent | Christian | No denomination | 0 |
2014 | Widowed | 75 | White | Don't know | Strong republican | Protestant | Baptist-dk which | 4 |
2014 | Widowed | 89 | White | Not applicable | Not str republican | Protestant | United methodist | 3 |
2014 | Divorced | 56 | White | $25000 or more | Independent | None | Not applicable | 4 |
2014 | Never married | 24 | White | $10000 - 14999 | Ind,near dem | None | Not applicable | 4 |
2014 | Never married | 27 | White | $25000 or more | Not str democrat | Catholic | Not applicable | NA |
2014 | Widowed | 71 | White | $20000 - 24999 | Ind,near rep | Protestant | Other | 2 |
# Number of observations per level of `race`.
gss_cat %>%
  count(race)
race | n |
---|---|
<fct> | <int> |
Other | 1959 |
Black | 3129 |
White | 16395 |
# Bar chart of the number of observations in each `race` level.
ggplot(data = gss_cat, mapping = aes(x = race)) +
  geom_bar()
修改因子水平
修改因子水平可以使图形标签更加清晰美观,以满足展示要求。最常用的工具是fct_recode()
函数,它可以对每个水平进行修改或重新编码。
# Number of observations per level of `partyid`.
gss_cat %>%
  count(partyid)
partyid | n |
---|---|
<fct> | <int> |
No answer | 154 |
Don't know | 1 |
Other party | 393 |
Strong republican | 2314 |
Not str republican | 3032 |
Ind,near rep | 1791 |
Independent | 4119 |
Ind,near dem | 2499 |
Not str democrat | 3690 |
Strong democrat | 3490 |
# Recode the party-affiliation levels to clearer, parallel labels.
# The result is written back to `partyid` (not a new, misspelled column) so
# that the following count() reflects the recoded levels. fct_recode() takes
# `new = old` pairs; levels not mentioned are left unchanged.
gss_cat %>%
  mutate(partyid = fct_recode(partyid,
    "Republican, strong"    = "Strong republican",
    "Republican, weak"      = "Not str republican",
    "Independent, near rep" = "Ind,near rep",
    "Independent, near dem" = "Ind,near dem",
    "Democrat, weak"        = "Not str democrat",
    "Democrat, strong"      = "Strong democrat"
  )) %>%
  count(partyid)
模型
一个简单的模型
library(modelr)
# Use `na.warn` as the session-wide NA action.
options(na.action = na.warn)

# Scatter plot of y against x for `sim1`.
ggplot(data = sim1, mapping = aes(x = x, y = y)) +
  geom_point()
使用geom_abline()
绘制直线,它接受斜率和截距作为参数。
# Candidate models: 250 random (intercept, slope) pairs, with the intercept
# `a1` drawn uniformly from [-20, 40] and the slope `a2` from [-5, 5].
models <- tibble(
  a1 = runif(250, -20, 40),
  a2 = runif(250, -5, 5)
)

# Overlay every candidate line on the data; the low alpha keeps the points
# visible beneath the mass of lines.
ggplot(sim1, aes(x, y)) +
  geom_abline(
    aes(intercept = a1, slope = a2),
    data = models, alpha = 1 / 4
  ) +
  geom_point()
# Predictions of a straight-line model.
#   a:    numeric vector; a[1] is used as the intercept, a[2] as the slope.
#   data: object with an `x` element (accessed as data$x).
# Returns a[1] + data$x * a[2], one value per element of data$x.
model1 <- function(a, data) {
  a[1] + data$x * a[2]
}
model1(c(7, 1.5), sim1)
- 8.5
- 8.5
- 8.5
- 10
- 10
- 10
- 11.5
- 11.5
- 11.5
- 13
- 13
- 13
- 14.5
- 14.5
- 14.5
- 16
- 16
- 16
- 17.5
- 17.5
- 17.5
- 19
- 19
- 19
- 20.5
- 20.5
- 20.5
- 22
- 22
- 22
# Fit a linear model of sim1$y on sim1$x and print the fit.
lm(sim1$y~sim1$x)
Call:
lm(formula = sim1$y ~ sim1$x)
Coefficients:
(Intercept) sim1$x
4.221 2.052
计算RMSE
# Root-mean-squared deviation between the observed data$y and the values
# predicted by model1() for the parameter vector `mod`.
#   mod:  numeric vector passed to model1() as its parameters.
#   data: object with `x` and `y` elements.
# Returns a single number.
measure_distance <- function(mod, data) {
  deviation <- data$y - model1(mod, data)
  sqrt(mean(deviation^2))
}
measure_distance(c(7, 1.5), sim1)
# Draw the single line with intercept 4.221 and slope 2.052 under the points.
# The constants are passed as fixed parameters rather than through
# aes() + data = models, which would redraw the same line once per row of
# `models`.
ggplot(sim1, aes(x, y)) +
  geom_abline(intercept = 4.221, slope = 2.052) +
  geom_point()
# Two-argument wrapper around measure_distance() with the data fixed to
# `sim1`, so it can be mapped over the intercept and slope columns.
sim1_dist <- function(a1, a2) {
  measure_distance(c(a1, a2), sim1)
}

# Attach the distance of every candidate model to `models`.
models <- models %>%
  mutate(dist = purrr::map2_dbl(a1, a2, sim1_dist))
models
a1 | a2 | dist |
---|---|---|
<dbl> | <dbl> | <dbl> |
25.8939742 | 4.12527708 | 33.678048 |
27.2648124 | -2.16334729 | 12.292729 |
16.4143964 | 2.39385045 | 14.270203 |
-13.9463849 | 2.67610426 | 14.992703 |
17.3968330 | 3.66866673 | 22.653911 |
-13.1896603 | 3.44623747 | 10.744162 |
26.5019563 | -3.12030982 | 16.223277 |
8.6455384 | -3.30895119 | 29.487148 |
5.4990308 | -3.50583991 | 33.422655 |
21.5492480 | 4.16330328 | 29.648394 |
18.8202022 | 3.37972138 | 22.335762 |
-4.3836592 | 2.46169729 | 6.798642 |
0.4930800 | -2.57624917 | 32.135936 |
23.5192233 | 2.02614316 | 19.276733 |
3.7050483 | -3.01259516 | 31.951123 |
-13.6400263 | -4.65592229 | 58.081509 |
12.0623313 | -1.62711042 | 16.422826 |
-8.6369843 | -0.04256578 | 25.196531 |
5.2793188 | 1.08157663 | 5.529676 |
0.7286800 | 0.20832003 | 14.775987 |
22.7578678 | 4.63078748 | 33.618505 |
22.4392158 | 3.28054053 | 25.315759 |
20.3574835 | 3.61268840 | 25.216336 |
34.3444000 | -4.06730493 | 18.051918 |
3.6715133 | -3.53752878 | 35.231382 |
-2.3516494 | -3.74150671 | 41.935432 |
2.6082739 | 1.13104592 | 7.488525 |
0.2349872 | 2.80973265 | 3.050534 |
-3.7599907 | 3.65673924 | 5.148361 |
-12.1784598 | 2.99719831 | 11.717719 |
⋮ | ⋮ | ⋮ |
-18.25429630 | 1.0628285 | 28.137680 |
28.47720882 | -1.4273278 | 11.428741 |
-1.69409033 | -2.3813102 | 32.931184 |
-11.12420415 | -1.7351186 | 37.831324 |
-6.89637993 | -0.7360245 | 27.715933 |
14.06403183 | -3.1103001 | 23.839734 |
26.15645752 | -0.5099415 | 10.965506 |
30.70797157 | -1.9196064 | 12.498618 |
34.76040949 | -3.6252376 | 16.457746 |
2.33044673 | -0.1417064 | 15.456574 |
10.33221718 | -2.5549683 | 23.434356 |
7.01245683 | 0.7628761 | 6.056801 |
-16.02814821 | 1.9522219 | 20.905743 |
29.04988048 | 3.9941010 | 36.011765 |
28.84471810 | 0.5693023 | 17.145523 |
12.01410521 | 3.4039627 | 15.862597 |
-0.01231243 | -4.1196387 | 42.142797 |
-2.27586546 | -4.7444047 | 48.067782 |
19.81414753 | -2.5600360 | 16.596301 |
-12.22730105 | -0.1983924 | 29.614875 |
2.01877660 | 2.5634932 | 2.658604 |
10.00248966 | -4.8671421 | 37.958676 |
-1.44763710 | 3.4218653 | 4.848906 |
-0.46312617 | 4.0839558 | 8.988031 |
-10.91154613 | -4.8939085 | 56.981015 |
9.77806405 | -3.7767585 | 31.415544 |
26.40300828 | 1.1381687 | 17.488052 |
-5.05738873 | -1.4701986 | 30.455595 |
-3.32008007 | 2.4790783 | 5.741700 |
13.22413116 | 2.0478461 | 9.231690 |
# Show the data together with the ten candidate lines that have the smallest
# `dist`, coloured by negative distance.
ggplot(data = sim1, mapping = aes(x = x, y = y)) +
  geom_point(size = 2, colour = "grey30") +
  geom_abline(
    mapping = aes(intercept = a1, slope = a2, colour = -dist),
    data = dplyr::filter(models, rank(dist) <= 10)
  )
# Minimise measure_distance() over the two model parameters, starting from
# c(0, 0); `data = sim1` is forwarded to measure_distance() by optim().
best <- optim(c(0, 0), measure_distance, data = sim1)
# The parameter vector found by the optimiser.
best$par
- 4.22224779961462
- 2.05120381317836
# Fit the same relationship with lm() and print the coefficients.
lm(sim1$y~sim1$x)
Call:
lm(formula = sim1$y ~ sim1$x)
Coefficients:
(Intercept) sim1$x
4.221 2.052
# Draw the line found by the optimiser (best$par[1] = intercept,
# best$par[2] = slope) under the points. The values are passed as fixed
# parameters rather than through aes() + data = models, which would redraw
# the same line once per row of `models`.
ggplot(sim1, aes(x, y)) +
  geom_abline(intercept = best$par[1], slope = best$par[2]) +
  geom_point()
残差
# Fit the linear model using column names plus `data =`, so that the model
# can be applied to a data frame by prediction/residual helpers.
m1 <- lm(y ~ x, data = sim1)

# Append the model residuals to the data. add_residuals() takes the data
# first and the fitted model second, so the model `m1` (not the data frame)
# must be supplied as the argument after the pipe.
sim1 <- sim1 %>%
  add_residuals(m1)
sim1