4. Subsetting a data.table
\[
DT[i, j, by] \\
\begin{aligned}
i&: \text{filters rows (analagous to a WHERE clause in SQL)} \\
j&: \text{is how you select/compute on columns} \\
by&: \text{is where you can group computations} \end{aligned}
\]
1) \(i\)
- first index is ALWAYS rows, regardless of commas
identical(flights[2], flights[2,])
## [1] TRUE
as.data.frame(flights)[2] %>% dim
## [1] 5819079 1
as.data.frame(flights)[2,] %>% dim
## [1] 1 31
flights[1:2] # first and second row
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER
## 1: 2015 1 1 4 AS 98 N407AS
## 2: 2015 1 1 4 AA 2336 N3KUAA
## ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME
## 1: ANC SEA 5 2354
## 2: LAX PBI 10 2
## DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 1: -11 21 15 205 194
## 2: -8 12 14 280 279
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## 1: 169 1448 404 4 430 408
## 2: 263 2330 737 4 750 741
## ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY
## 1: -22 0 0 NA
## 2: -9 0 0 NA
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1: NA NA NA NA
## 2: NA NA NA NA
flights[2:1] # second and first row
## YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER
## 1: 2015 1 1 4 AA 2336 N3KUAA
## 2: 2015 1 1 4 AS 98 N407AS
## ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME
## 1: LAX PBI 10 2
## 2: ANC SEA 5 2354
## DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 1: -8 12 14 280 279
## 2: -11 21 15 205 194
## AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## 1: 263 2330 737 4 750 741
## 2: 169 1448 404 4 430 408
## ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY
## 1: -9 0 0 NA
## 2: -22 0 0 NA
## SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1: NA NA NA NA
## 2: NA NA NA NA
flights[MONTH == 1]
flights[MONTH == 1 & DISTANCE > 1000]
identical(flights[-1:-100000], flights[!1:100000])
## [1] TRUE
order()
: optimized within a data.table
- makes sorting a cinch
- sort by one column and then another
- minus (‘-’) means descending order
flights[order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
F9 |
865 |
|
MSO |
DEN |
600 |
NA |
NA |
NA |
NA |
120 |
NA |
NA |
679 |
NA |
NA |
800 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
F9 |
1256 |
|
ORD |
MIA |
600 |
NA |
NA |
NA |
NA |
180 |
NA |
NA |
1197 |
NA |
NA |
1000 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
UA |
641 |
|
LAX |
HNL |
845 |
NA |
NA |
NA |
NA |
344 |
NA |
NA |
2556 |
NA |
NA |
1229 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
UA |
1412 |
|
ORD |
PHL |
1010 |
NA |
NA |
NA |
NA |
121 |
NA |
NA |
678 |
NA |
NA |
1311 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
US |
1883 |
|
SLC |
PHL |
1030 |
NA |
NA |
NA |
NA |
243 |
NA |
NA |
1927 |
NA |
NA |
1633 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
F9 |
1257 |
|
MIA |
ORD |
1050 |
NA |
NA |
NA |
NA |
200 |
NA |
NA |
1197 |
NA |
NA |
1310 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
sort by a function on our data
flights[order(substr(ORIGIN_AIRPORT,1,1))] %>% head %>% knitr::kable()
2015 |
10 |
1 |
4 |
AA |
1230 |
N3DBAA |
14747 |
11298 |
5 |
15 |
10 |
15 |
30 |
237 |
208 |
189 |
1660 |
539 |
4 |
602 |
543 |
-19 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
10 |
1 |
4 |
DL |
1805 |
N696DL |
14771 |
13487 |
5 |
16 |
11 |
14 |
30 |
213 |
192 |
171 |
1589 |
521 |
7 |
538 |
528 |
-10 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
10 |
1 |
4 |
NK |
612 |
N602NK |
12889 |
13487 |
5 |
2400 |
-5 |
15 |
15 |
177 |
168 |
149 |
1299 |
444 |
4 |
502 |
448 |
-14 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
10 |
1 |
4 |
AA |
260 |
N3GNAA |
12892 |
13303 |
10 |
7 |
-3 |
28 |
35 |
296 |
306 |
271 |
2342 |
806 |
7 |
806 |
813 |
7 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
10 |
1 |
4 |
AA |
1982 |
N914UY |
14771 |
11057 |
10 |
8 |
-2 |
12 |
20 |
291 |
282 |
257 |
2296 |
737 |
13 |
801 |
750 |
-11 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
10 |
1 |
4 |
UA |
684 |
N495UA |
14771 |
13930 |
14 |
13 |
-1 |
12 |
25 |
261 |
229 |
212 |
1846 |
557 |
5 |
635 |
602 |
-33 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
flights[!is.na(ARRIVAL_DELAY)][order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
2015 |
9 |
25 |
5 |
WN |
2736 |
7819A |
DSM |
MDW |
550 |
1321 |
451 |
11 |
1332 |
80 |
65 |
50 |
306 |
1422 |
4 |
710 |
1426 |
436 |
0 |
0 |
|
0 |
0 |
436 |
0 |
0 |
2015 |
8 |
5 |
3 |
WN |
815 |
7819A |
GEG |
DEN |
540 |
1250 |
430 |
8 |
1258 |
130 |
121 |
106 |
836 |
1544 |
7 |
850 |
1551 |
421 |
0 |
0 |
|
0 |
0 |
421 |
0 |
0 |
2015 |
11 |
21 |
6 |
WN |
2279 |
7819A |
ALB |
MCO |
1805 |
2324 |
319 |
11 |
2335 |
190 |
180 |
164 |
1073 |
219 |
5 |
2115 |
224 |
309 |
0 |
0 |
|
0 |
0 |
25 |
284 |
0 |
2015 |
11 |
21 |
6 |
WN |
2279 |
7819A |
MDW |
ALB |
1450 |
1932 |
282 |
30 |
2002 |
105 |
116 |
83 |
717 |
2225 |
3 |
1735 |
2228 |
293 |
0 |
0 |
|
11 |
0 |
0 |
245 |
37 |
2015 |
7 |
22 |
3 |
WN |
840 |
7819A |
PHL |
MCO |
1650 |
1943 |
173 |
10 |
1953 |
155 |
129 |
112 |
861 |
2145 |
7 |
1925 |
2152 |
147 |
0 |
0 |
|
0 |
0 |
76 |
71 |
0 |
2015 |
7 |
22 |
3 |
WN |
4636 |
7819A |
MCO |
RIC |
2000 |
2225 |
145 |
9 |
2234 |
110 |
105 |
91 |
667 |
5 |
5 |
2150 |
10 |
140 |
0 |
0 |
|
0 |
0 |
0 |
140 |
0 |
flights %>%
.[!is.na(ARRIVAL_DELAY)] %>%
.[order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
2015 |
9 |
25 |
5 |
WN |
2736 |
7819A |
DSM |
MDW |
550 |
1321 |
451 |
11 |
1332 |
80 |
65 |
50 |
306 |
1422 |
4 |
710 |
1426 |
436 |
0 |
0 |
|
0 |
0 |
436 |
0 |
0 |
2015 |
8 |
5 |
3 |
WN |
815 |
7819A |
GEG |
DEN |
540 |
1250 |
430 |
8 |
1258 |
130 |
121 |
106 |
836 |
1544 |
7 |
850 |
1551 |
421 |
0 |
0 |
|
0 |
0 |
421 |
0 |
0 |
2015 |
11 |
21 |
6 |
WN |
2279 |
7819A |
ALB |
MCO |
1805 |
2324 |
319 |
11 |
2335 |
190 |
180 |
164 |
1073 |
219 |
5 |
2115 |
224 |
309 |
0 |
0 |
|
0 |
0 |
25 |
284 |
0 |
2015 |
11 |
21 |
6 |
WN |
2279 |
7819A |
MDW |
ALB |
1450 |
1932 |
282 |
30 |
2002 |
105 |
116 |
83 |
717 |
2225 |
3 |
1735 |
2228 |
293 |
0 |
0 |
|
11 |
0 |
0 |
245 |
37 |
2015 |
7 |
22 |
3 |
WN |
840 |
7819A |
PHL |
MCO |
1650 |
1943 |
173 |
10 |
1953 |
155 |
129 |
112 |
861 |
2145 |
7 |
1925 |
2152 |
147 |
0 |
0 |
|
0 |
0 |
76 |
71 |
0 |
2015 |
7 |
22 |
3 |
WN |
4636 |
7819A |
MCO |
RIC |
2000 |
2225 |
145 |
9 |
2234 |
110 |
105 |
91 |
667 |
5 |
5 |
2150 |
10 |
140 |
0 |
0 |
|
0 |
0 |
0 |
140 |
0 |
2) \(j\)
# select columns by name in the second index parameter
flights[ , DESTINATION_AIRPORT] %>% head() %>% knitr::kable() # as a vector
flights[ , list(DESTINATION_AIRPORT)] %>% head() %>% knitr::kable() # as a data.table
- use list to select multiple columns
flights[ , list(DESTINATION_AIRPORT, ORIGIN_AIRPORT)] %>% head() %>% knitr::kable()
SEA |
ANC |
PBI |
LAX |
CLT |
SFO |
MIA |
LAX |
ANC |
SEA |
MSP |
SFO |
.()
is the same as list()
in data.table
flights[ , .(DESTINATION_AIRPORT, ORIGIN_AIRPORT)] %>% head() %>% knitr::kable()
SEA |
ANC |
PBI |
LAX |
CLT |
SFO |
MIA |
LAX |
ANC |
SEA |
MSP |
SFO |
- also computable on columns
flights[ , unique(ORIGIN_AIRPORT)] %>% head() %>% knitr::kable()
flights[ , sum(AIR_TIME, na.rm = TRUE) / (1e6 * 60)] %>% head() %>% knitr::kable()
flights[ , summary(as.factor(CANCELLATION_REASON))] %>% head() %>% knitr::kable()
|
5729195 |
A |
25262 |
B |
48851 |
C |
15749 |
D |
22 |
flights[ , .(AIR_TIME, AIR_TIME/60)] %>% head() %>% knitr::kable()# return multiple columns
169 |
2.816667 |
263 |
4.383333 |
266 |
4.433333 |
258 |
4.300000 |
199 |
3.316667 |
206 |
3.433333 |
flights[ , .(AIR_TIME, AIR_TIME_HOURS = AIR_TIME/60)] %>% head() %>% knitr::kable()
169 |
2.816667 |
263 |
4.383333 |
266 |
4.433333 |
258 |
4.300000 |
199 |
3.316667 |
206 |
3.433333 |
Anytime the \(j\) expression returns a list, the whole expression will return a data.table
.
3) \(i\) and \(j\)
Q1. How many flights were delayed touching down?
flights[ARRIVAL_DELAY>0, .N]
## [1] 2086896
.N
is a convenience symbol to get the number of rows in a data.table
Q2. What are some summary statistics of the delayed flights (in hours)?
flights[ARRIVAL_DELAY>0, summary(ARRIVAL_DELAY/60)]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01667 0.10000 0.25000 0.55188 0.63333 32.85000
Q3. How many flights were on each day of the week?
flights[ARRIVAL_DELAY>0, plot(table(DAY_OF_WEEK))]
## [1] 1 2 3 4 5 6 7
- we don’t have to return a
data.table
- an expression that returns a list in \(j\) would, but here we want to plot, and
data.table
evaluates the expression correctly.
4) by
: grouping
Q4. How many outgoing flights did each aiport have?
flights[ , .N, by = ORIGIN_AIRPORT] %>% head() %>% knitr::kable()
ANC |
16005 |
LAX |
194673 |
SFO |
148008 |
SEA |
110899 |
LAS |
133181 |
DEN |
196055 |
- Order it to see the biggest airports
flights[ , .N, by = ORIGIN_AIRPORT][order(-N)] %>% head() %>% knitr::kable()
ATL |
346836 |
ORD |
285884 |
DFW |
239551 |
DEN |
196055 |
LAX |
194673 |
SFO |
148008 |
Q5. Which airports had the most cancelled flights?
flights[CANCELLED == 1, .N, by = ORIGIN_AIRPORT][order(-N)]
## ORIGIN_AIRPORT N
## 1: ORD 8548
## 2: DFW 6254
## 3: LGA 4531
## 4: EWR 3110
## 5: BOS 2654
## ---
## 521: 13931 1
## 522: 14730 1
## 523: 10158 1
## 524: 11097 1
## 525: 14986 1
flights %>%
extract(CANCELLED == 1, .N, by = ORIGIN_AIRPORT) %>%
extract(order(-N))
# Here we use extract, magrittr's alias for [ rather than the .[] notation above
Q6. What’s the min and max departure delay for each airport, sorted by biggest difference between the two?
flights[ , .(MinDelay = min(DEPARTURE_DELAY, na.rm = TRUE),
MaxDelay = max(DEPARTURE_DELAY, na.rm = TRUE)),
by = ORIGIN_AIRPORT][order(MinDelay-MaxDelay)] %>% head() %>% knitr::kable()
BHM |
-20 |
1988 |
RIC |
-23 |
1878 |
SAN |
-31 |
1670 |
ABQ |
-22 |
1649 |
DTW |
-25 |
1631 |
IND |
-25 |
1625 |
5) .SD
: a special name for the subset of the data.table for each group
mtcarsDT <- as.data.table(mtcars, keep.rownames = 'Model')
mtcarsDT[ ,unique(cyl)] # we have three groups by cylinder count
## [1] 6 4 8
mtcarsDT[ , print(.SD), by = cyl] # so we have three data.tables that get printed
## Model mpg disp hp drat wt qsec vs am gear carb
## 1: Mazda RX4 21.0 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2: Mazda RX4 Wag 21.0 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3: Hornet 4 Drive 21.4 258.0 110 3.08 3.215 19.44 1 0 3 1
## 4: Valiant 18.1 225.0 105 2.76 3.460 20.22 1 0 3 1
## 5: Merc 280 19.2 167.6 123 3.92 3.440 18.30 1 0 4 4
## 6: Merc 280C 17.8 167.6 123 3.92 3.440 18.90 1 0 4 4
## 7: Ferrari Dino 19.7 145.0 175 3.62 2.770 15.50 0 1 5 6
## Model mpg disp hp drat wt qsec vs am gear carb
## 1: Datsun 710 22.8 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2: Merc 240D 24.4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3: Merc 230 22.8 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4: Fiat 128 32.4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 5: Honda Civic 30.4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 6: Toyota Corolla 33.9 71.1 65 4.22 1.835 19.90 1 1 4 1
## 7: Toyota Corona 21.5 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8: Fiat X1-9 27.3 79.0 66 4.08 1.935 18.90 1 1 4 1
## 9: Porsche 914-2 26.0 120.3 91 4.43 2.140 16.70 0 1 5 2
## 10: Lotus Europa 30.4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 11: Volvo 142E 21.4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Model mpg disp hp drat wt qsec vs am gear carb
## 1: Hornet Sportabout 18.7 360.0 175 3.15 3.440 17.02 0 0 3 2
## 2: Duster 360 14.3 360.0 245 3.21 3.570 15.84 0 0 3 4
## 3: Merc 450SE 16.4 275.8 180 3.07 4.070 17.40 0 0 3 3
## 4: Merc 450SL 17.3 275.8 180 3.07 3.730 17.60 0 0 3 3
## 5: Merc 450SLC 15.2 275.8 180 3.07 3.780 18.00 0 0 3 3
## 6: Cadillac Fleetwood 10.4 472.0 205 2.93 5.250 17.98 0 0 3 4
## 7: Lincoln Continental 10.4 460.0 215 3.00 5.424 17.82 0 0 3 4
## 8: Chrysler Imperial 14.7 440.0 230 3.23 5.345 17.42 0 0 3 4
## 9: Dodge Challenger 15.5 318.0 150 2.76 3.520 16.87 0 0 3 2
## 10: AMC Javelin 15.2 304.0 150 3.15 3.435 17.30 0 0 3 2
## 11: Camaro Z28 13.3 350.0 245 3.73 3.840 15.41 0 0 3 4
## 12: Pontiac Firebird 19.2 400.0 175 3.08 3.845 17.05 0 0 3 2
## 13: Ford Pantera L 15.8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 14: Maserati Bora 15.0 301.0 335 3.54 3.570 14.60 0 1 5 8
## Empty data.table (0 rows and 1 cols): cyl
Ref: https://www.youtube.com/watch?v=L6aT_oEhIKo
Q7. First two rows of flights from each day
flights[ , .SD[1:2], by = .(YEAR,MONTH,DAY)] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
AS |
98 |
N407AS |
ANC |
SEA |
5 |
2354 |
-11 |
21 |
15 |
205 |
194 |
169 |
1448 |
404 |
4 |
430 |
408 |
-22 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
AA |
2336 |
N3KUAA |
LAX |
PBI |
10 |
2 |
-8 |
12 |
14 |
280 |
279 |
263 |
2330 |
737 |
4 |
750 |
741 |
-9 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
2 |
5 |
AA |
2400 |
N5CDAA |
LAX |
DFW |
5 |
2359 |
-6 |
22 |
21 |
170 |
174 |
147 |
1235 |
448 |
5 |
455 |
453 |
-2 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
2 |
5 |
AS |
98 |
N764AS |
ANC |
SEA |
5 |
2353 |
-12 |
18 |
11 |
205 |
190 |
168 |
1448 |
359 |
4 |
430 |
403 |
-27 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
3 |
6 |
AA |
2400 |
N5DKAA |
LAX |
DFW |
5 |
5 |
0 |
23 |
28 |
170 |
174 |
148 |
1235 |
456 |
3 |
455 |
459 |
4 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
3 |
6 |
US |
580 |
N117UW |
SLC |
PHL |
10 |
16 |
6 |
18 |
34 |
244 |
236 |
212 |
1927 |
606 |
6 |
614 |
612 |
-2 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
Q8. 100 random obs from every day
flights[ , .SD[sample(1:.N, 100)], by = .(YEAR, MONTH,DAY)] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
DL |
1353 |
N927DA |
PNS |
ATL |
1203 |
1202 |
-1 |
8 |
1210 |
75 |
62 |
41 |
271 |
1351 |
13 |
1418 |
1404 |
-14 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
EV |
6071 |
N18101 |
DEN |
SLC |
1105 |
1124 |
19 |
13 |
1137 |
95 |
74 |
55 |
391 |
1232 |
6 |
1240 |
1238 |
-2 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
DL |
1585 |
N918DL |
LGA |
MCO |
1730 |
1743 |
13 |
20 |
1803 |
188 |
165 |
135 |
950 |
2018 |
10 |
2038 |
2028 |
-10 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015 |
1 |
1 |
4 |
EV |
5967 |
N14116 |
BNA |
DEN |
1602 |
1618 |
16 |
13 |
1631 |
192 |
202 |
177 |
1014 |
1828 |
12 |
1814 |
1840 |
26 |
0 |
0 |
|
23 |
0 |
3 |
0 |
0 |
2015 |
1 |
1 |
4 |
WN |
1044 |
N8652B |
AUS |
HOU |
1430 |
1432 |
2 |
10 |
1442 |
50 |
66 |
30 |
148 |
1512 |
26 |
1520 |
1538 |
18 |
0 |
0 |
|
18 |
0 |
0 |
0 |
0 |
2015 |
1 |
1 |
4 |
WN |
910 |
N949WN |
PVD |
MDW |
1320 |
1322 |
2 |
8 |
1330 |
165 |
168 |
155 |
842 |
1505 |
5 |
1505 |
1510 |
5 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
Q8. % of flights cancelled by airport?
flights[ , .(PctCancelled = mean(CANCELLED) %>% round(., 3)), by = ORIGIN_AIRPORT][order(-PctCancelled)] %>% head() %>% knitr::kable()
ITH |
0.118 |
13964 |
0.111 |
MMH |
0.103 |
10170 |
0.097 |
SUN |
0.092 |
DVL |
0.088 |
Q9. Remove all airports from the dataset that don’t have 10k flights
flights[ , .SD[.N >= 10000], by = ORIGIN_AIRPORT] %>% head() %>% knitr::kable()
ANC |
2015 |
1 |
1 |
4 |
AS |
98 |
N407AS |
SEA |
5 |
2354 |
-11 |
21 |
15 |
205 |
194 |
169 |
1448 |
404 |
4 |
430 |
408 |
-22 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
ANC |
2015 |
1 |
1 |
4 |
AS |
108 |
N309AS |
SEA |
45 |
41 |
-4 |
17 |
58 |
204 |
194 |
173 |
1448 |
451 |
4 |
509 |
455 |
-14 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
ANC |
2015 |
1 |
1 |
4 |
DL |
1560 |
N3743H |
SEA |
45 |
31 |
-14 |
25 |
56 |
210 |
200 |
171 |
1448 |
447 |
4 |
515 |
451 |
-24 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
ANC |
2015 |
1 |
1 |
4 |
AS |
122 |
N413AS |
PDX |
50 |
46 |
-4 |
11 |
57 |
215 |
201 |
187 |
1542 |
504 |
3 |
525 |
507 |
-18 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
ANC |
2015 |
1 |
1 |
4 |
AS |
136 |
N431AS |
SEA |
135 |
NA |
NA |
NA |
NA |
205 |
NA |
NA |
1448 |
NA |
NA |
600 |
NA |
NA |
0 |
1 |
A |
NA |
NA |
NA |
NA |
NA |
ANC |
2015 |
1 |
1 |
4 |
US |
617 |
N804AW |
PHX |
152 |
143 |
-9 |
21 |
204 |
323 |
322 |
298 |
2552 |
902 |
3 |
915 |
905 |
-10 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
- Here we’re saying to group by the airport of origin
- So
.SD
in each case is the complete set of flights for an airport
- We then filter that subset. We put in a single
TRUE
or FALSE
, is the number of rows greater than 10000?
- If it is, it returns TRUE and is recycled so that every row in the subset is returned.
- If not, then the FALSE is returned, recycled for all rows, and no rows are returned.
data.table
will build up the final dataset based on these little data.tables
- Appending zero rows to a data.table means nothing, and the airports that don’t have enough rows are dropped from the final output.
Q10. Which airports don’t have 10000 flights?
flights[ , .SD[.N < 10000], by = ORIGIN_AIRPORT][ , unique(ORIGIN_AIRPORT)] %>% head()
## [1] "FAI" "PBG" "IAG" "PSE" "BQN" "GEG"
6) Assignment
- there is an assignment operator that modifies in place (by reference)
- jargon translation: it will alter the specific object you use it on
- Create a date column that’s in date format
flights[ , Date := ymd(YEAR*10000+MONTH*100+DAY)] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
AS |
98 |
N407AS |
ANC |
SEA |
5 |
2354 |
-11 |
21 |
15 |
205 |
194 |
169 |
1448 |
404 |
4 |
430 |
408 |
-22 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
2015 |
1 |
1 |
4 |
AA |
2336 |
N3KUAA |
LAX |
PBI |
10 |
2 |
-8 |
12 |
14 |
280 |
279 |
263 |
2330 |
737 |
4 |
750 |
741 |
-9 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
2015 |
1 |
1 |
4 |
US |
840 |
N171US |
SFO |
CLT |
20 |
18 |
-2 |
16 |
34 |
286 |
293 |
266 |
2296 |
800 |
11 |
806 |
811 |
5 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
2015 |
1 |
1 |
4 |
AA |
258 |
N3HYAA |
LAX |
MIA |
20 |
15 |
-5 |
15 |
30 |
285 |
281 |
258 |
2342 |
748 |
8 |
805 |
756 |
-9 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
2015 |
1 |
1 |
4 |
AS |
135 |
N527AS |
SEA |
ANC |
25 |
24 |
-1 |
11 |
35 |
235 |
215 |
199 |
1448 |
254 |
5 |
320 |
259 |
-21 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
2015 |
1 |
1 |
4 |
DL |
806 |
N3730B |
SFO |
MSP |
25 |
20 |
-5 |
18 |
38 |
217 |
230 |
206 |
1589 |
604 |
6 |
602 |
610 |
8 |
0 |
0 |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
- Reformat the dataset into boolean format
flights[ , CANCELLED := ifelse(CANCELLED == 1, TRUE, FALSE)]
flights[ , AvgSpeed := DISTANCE / (AIR_TIME/60)]
- Update the
data.table
selectively
flights[ CANCELLATION_REASON2 == 'A', CANCELLATION_REASON := 'Airline' ]
flights[ CANCELLATION_REASON2 == 'B', CANCELLATION_REASON := 'Weather']
flights[ CANCELLATION_REASON2 == 'C', CANCELLATION_REASON := 'National Air System']
flights[ CANCELLATION_REASON2 == 'D', CANCELLATION_REASON := 'Security']
flights[ CANCELLATION_REASON2 == '', CANCELLATION_REASON := NA]
flights[!is.na(CANCELLATION_REASON2), .N, by = CANCELLATION_REASON2] %>%
ggplot(aes(x = CANCELLATION_REASON2, y = N)) +
geom_histogram(stat = 'identity')
[As a function]
cancelReasonUpdate <- function(inStr){
switch(inStr,
"A" = "Airline",
"B" = "Weather",
"C" = "National Air System",
"D" = "Security",
NA)
}
flights[ , CANCELLATION_REASON2 := map_chr(CANCELLATION_REASON,cancelReasonUpdate)]
flights[ , CANCELLATION_REASON2 := NULL]
flights[ , MaxAvgSpeed := max(AvgSpeed, na.rm = TRUE), by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)]
note that the group level statistic is now added to each observation
6) join
- Load the
airports
dataset
airports <- fread("Data/airports.csv")
airports %>% head() %>% knitr::kable()
ABE |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.44040 |
ABI |
Abilene Regional Airport |
Abilene |
TX |
USA |
32.41132 |
-99.68190 |
ABQ |
Albuquerque International Sunport |
Albuquerque |
NM |
USA |
35.04022 |
-106.60919 |
ABR |
Aberdeen Regional Airport |
Aberdeen |
SD |
USA |
45.44906 |
-98.42183 |
ABY |
Southwest Georgia Regional Airport |
Albany |
GA |
USA |
31.53552 |
-84.19447 |
ACK |
Nantucket Memorial Airport |
Nantucket |
MA |
USA |
41.25305 |
-70.06018 |
a. inner join
merge(x = flights,
y = airports,
by.x = 'ORIGIN_AIRPORT',
by.y = 'IATA_CODE') %>% head() %>% knitr::kable()
ABE |
2015 |
1 |
1 |
4 |
EV |
5103 |
N846AS |
DTW |
600 |
552 |
-8 |
12 |
604 |
113 |
101 |
84 |
425 |
728 |
5 |
753 |
733 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
303.5714 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
ABE |
2015 |
1 |
1 |
4 |
EV |
5579 |
N136EV |
ATL |
635 |
628 |
-7 |
13 |
641 |
139 |
126 |
110 |
692 |
831 |
3 |
854 |
834 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
377.4545 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
ABE |
2015 |
1 |
1 |
4 |
EV |
5267 |
N131EV |
ATL |
1600 |
1600 |
0 |
9 |
1609 |
140 |
119 |
103 |
692 |
1752 |
7 |
1820 |
1759 |
-21 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
403.1068 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
ABE |
2015 |
1 |
2 |
5 |
EV |
5103 |
N849AS |
DTW |
600 |
600 |
0 |
9 |
609 |
113 |
100 |
85 |
425 |
734 |
6 |
753 |
740 |
-13 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
300.0000 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
ABE |
2015 |
1 |
2 |
5 |
EV |
6144 |
N15912 |
ORD |
600 |
607 |
7 |
8 |
615 |
140 |
139 |
125 |
654 |
720 |
6 |
720 |
726 |
6 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
313.9200 |
445.9091 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
ABE |
2015 |
1 |
2 |
5 |
EV |
5579 |
N538CA |
ATL |
635 |
627 |
-8 |
13 |
640 |
139 |
143 |
120 |
692 |
840 |
10 |
854 |
850 |
-4 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
346.0000 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
b. left outer join
flights[airports, on = c(ORIGIN_AIRPORT = 'IATA_CODE')] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
EV |
5103 |
N846AS |
ABE |
DTW |
600 |
552 |
-8 |
12 |
604 |
113 |
101 |
84 |
425 |
728 |
5 |
753 |
733 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
303.5714 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5579 |
N136EV |
ABE |
ATL |
635 |
628 |
-7 |
13 |
641 |
139 |
126 |
110 |
692 |
831 |
3 |
854 |
834 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
377.4545 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5267 |
N131EV |
ABE |
ATL |
1600 |
1600 |
0 |
9 |
1609 |
140 |
119 |
103 |
692 |
1752 |
7 |
1820 |
1759 |
-21 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
403.1068 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5103 |
N849AS |
ABE |
DTW |
600 |
600 |
0 |
9 |
609 |
113 |
100 |
85 |
425 |
734 |
6 |
753 |
740 |
-13 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
300.0000 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
6144 |
N15912 |
ABE |
ORD |
600 |
607 |
7 |
8 |
615 |
140 |
139 |
125 |
654 |
720 |
6 |
720 |
726 |
6 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
313.9200 |
445.9091 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5579 |
N538CA |
ABE |
ATL |
635 |
627 |
-8 |
13 |
640 |
139 |
143 |
120 |
692 |
840 |
10 |
854 |
850 |
-4 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
346.0000 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
flights[airports, on = "ORIGIN_AIRPORT == IATA_CODE"] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
EV |
5103 |
N846AS |
ABE |
DTW |
600 |
552 |
-8 |
12 |
604 |
113 |
101 |
84 |
425 |
728 |
5 |
753 |
733 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
303.5714 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5579 |
N136EV |
ABE |
ATL |
635 |
628 |
-7 |
13 |
641 |
139 |
126 |
110 |
692 |
831 |
3 |
854 |
834 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
377.4545 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5267 |
N131EV |
ABE |
ATL |
1600 |
1600 |
0 |
9 |
1609 |
140 |
119 |
103 |
692 |
1752 |
7 |
1820 |
1759 |
-21 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
403.1068 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5103 |
N849AS |
ABE |
DTW |
600 |
600 |
0 |
9 |
609 |
113 |
100 |
85 |
425 |
734 |
6 |
753 |
740 |
-13 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
300.0000 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
6144 |
N15912 |
ABE |
ORD |
600 |
607 |
7 |
8 |
615 |
140 |
139 |
125 |
654 |
720 |
6 |
720 |
726 |
6 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
313.9200 |
445.9091 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5579 |
N538CA |
ABE |
ATL |
635 |
627 |
-8 |
13 |
640 |
139 |
143 |
120 |
692 |
840 |
10 |
854 |
850 |
-4 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
346.0000 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
c. inner join
flights[airports, on = c(ORIGIN_AIRPORT = 'IATA_CODE'), nomatch = 0] %>% head() %>% knitr::kable()
2015 |
1 |
1 |
4 |
EV |
5103 |
N846AS |
ABE |
DTW |
600 |
552 |
-8 |
12 |
604 |
113 |
101 |
84 |
425 |
728 |
5 |
753 |
733 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
303.5714 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5579 |
N136EV |
ABE |
ATL |
635 |
628 |
-7 |
13 |
641 |
139 |
126 |
110 |
692 |
831 |
3 |
854 |
834 |
-20 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
377.4545 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
1 |
4 |
EV |
5267 |
N131EV |
ABE |
ATL |
1600 |
1600 |
0 |
9 |
1609 |
140 |
119 |
103 |
692 |
1752 |
7 |
1820 |
1759 |
-21 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-01 |
403.1068 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5103 |
N849AS |
ABE |
DTW |
600 |
600 |
0 |
9 |
609 |
113 |
100 |
85 |
425 |
734 |
6 |
753 |
740 |
-13 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
300.0000 |
392.3077 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
6144 |
N15912 |
ABE |
ORD |
600 |
607 |
7 |
8 |
615 |
140 |
139 |
125 |
654 |
720 |
6 |
720 |
726 |
6 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
313.9200 |
445.9091 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
2015 |
1 |
2 |
5 |
EV |
5579 |
N538CA |
ABE |
ATL |
635 |
627 |
-8 |
13 |
640 |
139 |
143 |
120 |
692 |
840 |
10 |
854 |
850 |
-4 |
0 |
FALSE |
|
NA |
NA |
NA |
NA |
NA |
2015-01-02 |
346.0000 |
494.2857 |
Lehigh Valley International Airport |
Allentown |
PA |
USA |
40.65236 |
-75.4404 |
d. not join (a.k.a anti-join)
airports[!flights, on = "IATA_CODE == ORIGIN_AIRPORT"] %>% head() %>% knitr::kable()
- Ref: Cheat sheet of
data.table
: Click
End of Document