1. Package loading

library(data.table)

2. Example data: flights

flights <- fread("Data/flights.csv")
head(flights) %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 1 1 4 AS 98 N407AS ANC SEA 5 2354 -11 21 15 205 194 169 1448 404 4 430 408 -22 0 0 NA NA NA NA NA
2015 1 1 4 AA 2336 N3KUAA LAX PBI 10 2 -8 12 14 280 279 263 2330 737 4 750 741 -9 0 0 NA NA NA NA NA
2015 1 1 4 US 840 N171US SFO CLT 20 18 -2 16 34 286 293 266 2296 800 11 806 811 5 0 0 NA NA NA NA NA
2015 1 1 4 AA 258 N3HYAA LAX MIA 20 15 -5 15 30 285 281 258 2342 748 8 805 756 -9 0 0 NA NA NA NA NA
2015 1 1 4 AS 135 N527AS SEA ANC 25 24 -1 11 35 235 215 199 1448 254 5 320 259 -21 0 0 NA NA NA NA NA
2015 1 1 4 DL 806 N3730B SFO MSP 25 20 -5 18 38 217 230 206 1589 604 6 602 610 8 0 0 NA NA NA NA NA

3. What is data.table?

class(flights)
## [1] "data.table" "data.frame"

It is an extension of the data.frame class.


4. Subsetting a data.table

\[ DT[i, j, by] \\ \begin{aligned} i&: \text{filters rows (analagous to a WHERE clause in SQL)} \\ j&: \text{is how you select/compute on columns} \\ by&: \text{is where you can group computations} \end{aligned} \]


1) \(i\)

  • first index is ALWAYS rows, regardless of commas
identical(flights[2], flights[2,])
## [1] TRUE


  • Compare to data.frame
as.data.frame(flights)[2]  %>% dim
## [1] 5819079       1
as.data.frame(flights)[2,] %>% dim
## [1]  1 31


  • Use \(i\) to select rows
flights[1:2] # first and second row
##    YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER
## 1: 2015     1   1           4      AS            98      N407AS
## 2: 2015     1   1           4      AA          2336      N3KUAA
##    ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME
## 1:            ANC                 SEA                   5           2354
## 2:            LAX                 PBI                  10              2
##    DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 1:             -11       21         15            205          194
## 2:              -8       12         14            280          279
##    AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## 1:      169     1448       404       4               430          408
## 2:      263     2330       737       4               750          741
##    ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY
## 1:           -22        0         0                                   NA
## 2:            -9        0         0                                   NA
##    SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1:             NA            NA                  NA            NA
## 2:             NA            NA                  NA            NA
flights[2:1] # second and first row
##    YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER
## 1: 2015     1   1           4      AA          2336      N3KUAA
## 2: 2015     1   1           4      AS            98      N407AS
##    ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME
## 1:            LAX                 PBI                  10              2
## 2:            ANC                 SEA                   5           2354
##    DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME
## 1:              -8       12         14            280          279
## 2:             -11       21         15            205          194
##    AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME
## 1:      263     2330       737       4               750          741
## 2:      169     1448       404       4               430          408
##    ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY
## 1:            -9        0         0                                   NA
## 2:           -22        0         0                                   NA
##    SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
## 1:             NA            NA                  NA            NA
## 2:             NA            NA                  NA            NA


  • Logical expressions
flights[MONTH == 1]
flights[MONTH == 1 & DISTANCE > 1000]


  • identical() function
identical(flights[-1:-100000], flights[!1:100000])
## [1] TRUE


  • order(): optimized within a data.table
  • makes sorting a cinch
  • sort by one column and then another
  • minus (‘-’) means descending order
flights[order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 1 1 4 F9 865 MSO DEN 600 NA NA NA NA 120 NA NA 679 NA NA 800 NA NA 0 1 A NA NA NA NA NA
2015 1 1 4 F9 1256 ORD MIA 600 NA NA NA NA 180 NA NA 1197 NA NA 1000 NA NA 0 1 A NA NA NA NA NA
2015 1 1 4 UA 641 LAX HNL 845 NA NA NA NA 344 NA NA 2556 NA NA 1229 NA NA 0 1 A NA NA NA NA NA
2015 1 1 4 UA 1412 ORD PHL 1010 NA NA NA NA 121 NA NA 678 NA NA 1311 NA NA 0 1 A NA NA NA NA NA
2015 1 1 4 US 1883 SLC PHL 1030 NA NA NA NA 243 NA NA 1927 NA NA 1633 NA NA 0 1 A NA NA NA NA NA
2015 1 1 4 F9 1257 MIA ORD 1050 NA NA NA NA 200 NA NA 1197 NA NA 1310 NA NA 0 1 A NA NA NA NA NA


sort by a function on our data

flights[order(substr(ORIGIN_AIRPORT,1,1))] %>% head %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 10 1 4 AA 1230 N3DBAA 14747 11298 5 15 10 15 30 237 208 189 1660 539 4 602 543 -19 0 0 NA NA NA NA NA
2015 10 1 4 DL 1805 N696DL 14771 13487 5 16 11 14 30 213 192 171 1589 521 7 538 528 -10 0 0 NA NA NA NA NA
2015 10 1 4 NK 612 N602NK 12889 13487 5 2400 -5 15 15 177 168 149 1299 444 4 502 448 -14 0 0 NA NA NA NA NA
2015 10 1 4 AA 260 N3GNAA 12892 13303 10 7 -3 28 35 296 306 271 2342 806 7 806 813 7 0 0 NA NA NA NA NA
2015 10 1 4 AA 1982 N914UY 14771 11057 10 8 -2 12 20 291 282 257 2296 737 13 801 750 -11 0 0 NA NA NA NA NA
2015 10 1 4 UA 684 N495UA 14771 13930 14 13 -1 12 25 261 229 212 1846 557 5 635 602 -33 0 0 NA NA NA NA NA


  • chain queries together
flights[!is.na(ARRIVAL_DELAY)][order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 9 25 5 WN 2736 7819A DSM MDW 550 1321 451 11 1332 80 65 50 306 1422 4 710 1426 436 0 0 0 0 436 0 0
2015 8 5 3 WN 815 7819A GEG DEN 540 1250 430 8 1258 130 121 106 836 1544 7 850 1551 421 0 0 0 0 421 0 0
2015 11 21 6 WN 2279 7819A ALB MCO 1805 2324 319 11 2335 190 180 164 1073 219 5 2115 224 309 0 0 0 0 25 284 0
2015 11 21 6 WN 2279 7819A MDW ALB 1450 1932 282 30 2002 105 116 83 717 2225 3 1735 2228 293 0 0 11 0 0 245 37
2015 7 22 3 WN 840 7819A PHL MCO 1650 1943 173 10 1953 155 129 112 861 2145 7 1925 2152 147 0 0 0 0 76 71 0
2015 7 22 3 WN 4636 7819A MCO RIC 2000 2225 145 9 2234 110 105 91 667 5 5 2150 10 140 0 0 0 0 0 140 0


flights %>%
  .[!is.na(ARRIVAL_DELAY)] %>%
  .[order(TAIL_NUMBER,-ARRIVAL_DELAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 9 25 5 WN 2736 7819A DSM MDW 550 1321 451 11 1332 80 65 50 306 1422 4 710 1426 436 0 0 0 0 436 0 0
2015 8 5 3 WN 815 7819A GEG DEN 540 1250 430 8 1258 130 121 106 836 1544 7 850 1551 421 0 0 0 0 421 0 0
2015 11 21 6 WN 2279 7819A ALB MCO 1805 2324 319 11 2335 190 180 164 1073 219 5 2115 224 309 0 0 0 0 25 284 0
2015 11 21 6 WN 2279 7819A MDW ALB 1450 1932 282 30 2002 105 116 83 717 2225 3 1735 2228 293 0 0 11 0 0 245 37
2015 7 22 3 WN 840 7819A PHL MCO 1650 1943 173 10 1953 155 129 112 861 2145 7 1925 2152 147 0 0 0 0 76 71 0
2015 7 22 3 WN 4636 7819A MCO RIC 2000 2225 145 9 2234 110 105 91 667 5 5 2150 10 140 0 0 0 0 0 140 0


2) \(j\)

  • computations on columns
# select columns by name in the second index parameter
flights[ , DESTINATION_AIRPORT] %>% head() %>% knitr::kable()           # as a vector
x
SEA
PBI
CLT
MIA
ANC
MSP
flights[ , list(DESTINATION_AIRPORT)] %>% head() %>% knitr::kable()     # as a data.table
DESTINATION_AIRPORT
SEA
PBI
CLT
MIA
ANC
MSP


  • use list to select multiple columns
flights[ , list(DESTINATION_AIRPORT, ORIGIN_AIRPORT)] %>% head() %>% knitr::kable()
DESTINATION_AIRPORT ORIGIN_AIRPORT
SEA ANC
PBI LAX
CLT SFO
MIA LAX
ANC SEA
MSP SFO


  • .() is the same as list() in data.table
flights[ , .(DESTINATION_AIRPORT, ORIGIN_AIRPORT)] %>% head() %>% knitr::kable()
DESTINATION_AIRPORT ORIGIN_AIRPORT
SEA ANC
PBI LAX
CLT SFO
MIA LAX
ANC SEA
MSP SFO


  • also computable on columns
flights[ , unique(ORIGIN_AIRPORT)]  %>% head() %>% knitr::kable()
x
ANC
LAX
SFO
SEA
LAS
DEN
flights[ , sum(AIR_TIME, na.rm = TRUE) / (1e6 * 60)]  %>% head() %>% knitr::kable()
x
10.81011
flights[ , summary(as.factor(CANCELLATION_REASON))]  %>% head() %>% knitr::kable()
x
5729195
A 25262
B 48851
C 15749
D 22
flights[ , .(AIR_TIME, AIR_TIME/60)] %>% head() %>% knitr::kable()# return multiple columns
AIR_TIME V2
169 2.816667
263 4.383333
266 4.433333
258 4.300000
199 3.316667
206 3.433333


  • name the items
flights[ , .(AIR_TIME, AIR_TIME_HOURS = AIR_TIME/60)] %>% head() %>% knitr::kable()
AIR_TIME AIR_TIME_HOURS
169 2.816667
263 4.383333
266 4.433333
258 4.300000
199 3.316667
206 3.433333

Anytime the \(j\) expression returns a list, the whole expression will return a data.table.


3) \(i\) and \(j\)

Q1. How many flights were delayed touching down?
flights[ARRIVAL_DELAY>0, .N] 
## [1] 2086896

.N is a convenience symbol to get the number of rows in a data.table


Q2. What are some summary statistics of the delayed flights (in hours)?
flights[ARRIVAL_DELAY>0, summary(ARRIVAL_DELAY/60)]
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.01667  0.10000  0.25000  0.55188  0.63333 32.85000


Q3. How many flights were on each day of the week?
flights[ARRIVAL_DELAY>0, plot(table(DAY_OF_WEEK))] 

## [1] 1 2 3 4 5 6 7
  • we don’t have to return a data.table
  • an expression that returns a list in \(j\) would, but here we want to plot, and data.table evaluates the expression correctly.


4) by: grouping

Q4. How many outgoing flights did each aiport have?
flights[ , .N, by = ORIGIN_AIRPORT] %>% head() %>% knitr::kable()
ORIGIN_AIRPORT N
ANC 16005
LAX 194673
SFO 148008
SEA 110899
LAS 133181
DEN 196055


  • Order it to see the biggest airports
flights[ , .N, by = ORIGIN_AIRPORT][order(-N)] %>% head() %>% knitr::kable()
ORIGIN_AIRPORT N
ATL 346836
ORD 285884
DFW 239551
DEN 196055
LAX 194673
SFO 148008


Q5. Which airports had the most cancelled flights?
flights[CANCELLED == 1, .N, by = ORIGIN_AIRPORT][order(-N)]
##      ORIGIN_AIRPORT    N
##   1:            ORD 8548
##   2:            DFW 6254
##   3:            LGA 4531
##   4:            EWR 3110
##   5:            BOS 2654
##  ---                    
## 521:          13931    1
## 522:          14730    1
## 523:          10158    1
## 524:          11097    1
## 525:          14986    1


flights %>%
  extract(CANCELLED == 1, .N, by = ORIGIN_AIRPORT) %>%
  extract(order(-N))
# Here we use extract, magrittr's alias for [ rather than the .[] notation above


Q6. What’s the min and max departure delay for each airport, sorted by biggest difference between the two?
flights[ , .(MinDelay = min(DEPARTURE_DELAY, na.rm = TRUE), 
             MaxDelay = max(DEPARTURE_DELAY, na.rm = TRUE)), 
         by = ORIGIN_AIRPORT][order(MinDelay-MaxDelay)] %>% head() %>% knitr::kable()
ORIGIN_AIRPORT MinDelay MaxDelay
BHM -20 1988
RIC -23 1878
SAN -31 1670
ABQ -22 1649
DTW -25 1631
IND -25 1625


5) .SD: a special name for the subset of the data.table for each group

mtcarsDT <- as.data.table(mtcars, keep.rownames = 'Model')
mtcarsDT[ ,unique(cyl)] # we have three groups by cylinder count
## [1] 6 4 8
mtcarsDT[ , print(.SD), by = cyl] # so we have three data.tables that get printed
##             Model  mpg  disp  hp drat    wt  qsec vs am gear carb
## 1:      Mazda RX4 21.0 160.0 110 3.90 2.620 16.46  0  1    4    4
## 2:  Mazda RX4 Wag 21.0 160.0 110 3.90 2.875 17.02  0  1    4    4
## 3: Hornet 4 Drive 21.4 258.0 110 3.08 3.215 19.44  1  0    3    1
## 4:        Valiant 18.1 225.0 105 2.76 3.460 20.22  1  0    3    1
## 5:       Merc 280 19.2 167.6 123 3.92 3.440 18.30  1  0    4    4
## 6:      Merc 280C 17.8 167.6 123 3.92 3.440 18.90  1  0    4    4
## 7:   Ferrari Dino 19.7 145.0 175 3.62 2.770 15.50  0  1    5    6
##              Model  mpg  disp  hp drat    wt  qsec vs am gear carb
##  1:     Datsun 710 22.8 108.0  93 3.85 2.320 18.61  1  1    4    1
##  2:      Merc 240D 24.4 146.7  62 3.69 3.190 20.00  1  0    4    2
##  3:       Merc 230 22.8 140.8  95 3.92 3.150 22.90  1  0    4    2
##  4:       Fiat 128 32.4  78.7  66 4.08 2.200 19.47  1  1    4    1
##  5:    Honda Civic 30.4  75.7  52 4.93 1.615 18.52  1  1    4    2
##  6: Toyota Corolla 33.9  71.1  65 4.22 1.835 19.90  1  1    4    1
##  7:  Toyota Corona 21.5 120.1  97 3.70 2.465 20.01  1  0    3    1
##  8:      Fiat X1-9 27.3  79.0  66 4.08 1.935 18.90  1  1    4    1
##  9:  Porsche 914-2 26.0 120.3  91 4.43 2.140 16.70  0  1    5    2
## 10:   Lotus Europa 30.4  95.1 113 3.77 1.513 16.90  1  1    5    2
## 11:     Volvo 142E 21.4 121.0 109 4.11 2.780 18.60  1  1    4    2
##                   Model  mpg  disp  hp drat    wt  qsec vs am gear carb
##  1:   Hornet Sportabout 18.7 360.0 175 3.15 3.440 17.02  0  0    3    2
##  2:          Duster 360 14.3 360.0 245 3.21 3.570 15.84  0  0    3    4
##  3:          Merc 450SE 16.4 275.8 180 3.07 4.070 17.40  0  0    3    3
##  4:          Merc 450SL 17.3 275.8 180 3.07 3.730 17.60  0  0    3    3
##  5:         Merc 450SLC 15.2 275.8 180 3.07 3.780 18.00  0  0    3    3
##  6:  Cadillac Fleetwood 10.4 472.0 205 2.93 5.250 17.98  0  0    3    4
##  7: Lincoln Continental 10.4 460.0 215 3.00 5.424 17.82  0  0    3    4
##  8:   Chrysler Imperial 14.7 440.0 230 3.23 5.345 17.42  0  0    3    4
##  9:    Dodge Challenger 15.5 318.0 150 2.76 3.520 16.87  0  0    3    2
## 10:         AMC Javelin 15.2 304.0 150 3.15 3.435 17.30  0  0    3    2
## 11:          Camaro Z28 13.3 350.0 245 3.73 3.840 15.41  0  0    3    4
## 12:    Pontiac Firebird 19.2 400.0 175 3.08 3.845 17.05  0  0    3    2
## 13:      Ford Pantera L 15.8 351.0 264 4.22 3.170 14.50  0  1    5    4
## 14:       Maserati Bora 15.0 301.0 335 3.54 3.570 14.60  0  1    5    8
## Empty data.table (0 rows and 1 cols): cyl

Ref: https://www.youtube.com/watch?v=L6aT_oEhIKo


Q7. First two rows of flights from each day
flights[ , .SD[1:2], by = .(YEAR,MONTH,DAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 1 1 4 AS 98 N407AS ANC SEA 5 2354 -11 21 15 205 194 169 1448 404 4 430 408 -22 0 0 NA NA NA NA NA
2015 1 1 4 AA 2336 N3KUAA LAX PBI 10 2 -8 12 14 280 279 263 2330 737 4 750 741 -9 0 0 NA NA NA NA NA
2015 1 2 5 AA 2400 N5CDAA LAX DFW 5 2359 -6 22 21 170 174 147 1235 448 5 455 453 -2 0 0 NA NA NA NA NA
2015 1 2 5 AS 98 N764AS ANC SEA 5 2353 -12 18 11 205 190 168 1448 359 4 430 403 -27 0 0 NA NA NA NA NA
2015 1 3 6 AA 2400 N5DKAA LAX DFW 5 5 0 23 28 170 174 148 1235 456 3 455 459 4 0 0 NA NA NA NA NA
2015 1 3 6 US 580 N117UW SLC PHL 10 16 6 18 34 244 236 212 1927 606 6 614 612 -2 0 0 NA NA NA NA NA


Q8. 100 random obs from every day

flights[ , .SD[sample(1:.N, 100)], by = .(YEAR, MONTH,DAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
2015 1 1 4 DL 1353 N927DA PNS ATL 1203 1202 -1 8 1210 75 62 41 271 1351 13 1418 1404 -14 0 0 NA NA NA NA NA
2015 1 1 4 EV 6071 N18101 DEN SLC 1105 1124 19 13 1137 95 74 55 391 1232 6 1240 1238 -2 0 0 NA NA NA NA NA
2015 1 1 4 DL 1585 N918DL LGA MCO 1730 1743 13 20 1803 188 165 135 950 2018 10 2038 2028 -10 0 0 NA NA NA NA NA
2015 1 1 4 EV 5967 N14116 BNA DEN 1602 1618 16 13 1631 192 202 177 1014 1828 12 1814 1840 26 0 0 23 0 3 0 0
2015 1 1 4 WN 1044 N8652B AUS HOU 1430 1432 2 10 1442 50 66 30 148 1512 26 1520 1538 18 0 0 18 0 0 0 0
2015 1 1 4 WN 910 N949WN PVD MDW 1320 1322 2 8 1330 165 168 155 842 1505 5 1505 1510 5 0 0 NA NA NA NA NA


Q8. % of flights cancelled by airport?
flights[ , .(PctCancelled = mean(CANCELLED) %>% round(., 3)), by = ORIGIN_AIRPORT][order(-PctCancelled)] %>% head() %>% knitr::kable()
ORIGIN_AIRPORT PctCancelled
ITH 0.118
13964 0.111
MMH 0.103
10170 0.097
SUN 0.092
DVL 0.088


Q9. Remove all airports from the dataset that don’t have 10k flights
flights[ , .SD[.N >= 10000], by = ORIGIN_AIRPORT] %>% head() %>% knitr::kable()
ORIGIN_AIRPORT YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY
ANC 2015 1 1 4 AS 98 N407AS SEA 5 2354 -11 21 15 205 194 169 1448 404 4 430 408 -22 0 0 NA NA NA NA NA
ANC 2015 1 1 4 AS 108 N309AS SEA 45 41 -4 17 58 204 194 173 1448 451 4 509 455 -14 0 0 NA NA NA NA NA
ANC 2015 1 1 4 DL 1560 N3743H SEA 45 31 -14 25 56 210 200 171 1448 447 4 515 451 -24 0 0 NA NA NA NA NA
ANC 2015 1 1 4 AS 122 N413AS PDX 50 46 -4 11 57 215 201 187 1542 504 3 525 507 -18 0 0 NA NA NA NA NA
ANC 2015 1 1 4 AS 136 N431AS SEA 135 NA NA NA NA 205 NA NA 1448 NA NA 600 NA NA 0 1 A NA NA NA NA NA
ANC 2015 1 1 4 US 617 N804AW PHX 152 143 -9 21 204 323 322 298 2552 902 3 915 905 -10 0 0 NA NA NA NA NA
  • Here we’re saying to group by the airport of origin
  • So .SD in each case is the complete set of flights for an airport
  • We then filter that subset. We put in a single TRUE or FALSE, is the number of rows greater than 10000?
  • If it is, it returns TRUE and is recycled so that every row in the subset is returned.
  • If not, then the FALSE is returned, recycled for all rows, and no rows are returned.
  • data.table will build up the final dataset based on these little data.tables
  • Appending zero rows to a data.table means nothing, and the airports that don’t have enough rows are dropped from the final output.


Q10. Which airports don’t have 10000 flights?
flights[ , .SD[.N < 10000], by = ORIGIN_AIRPORT][ , unique(ORIGIN_AIRPORT)] %>% head()
## [1] "FAI" "PBG" "IAG" "PSE" "BQN" "GEG"


6) Assignment

  • there is an assignment operator that modifies in place (by reference)
  • jargon translation: it will alter the specific object you use it on


  • Create a date column that’s in date format
flights[ , Date := ymd(YEAR*10000+MONTH*100+DAY)] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY Date
2015 1 1 4 AS 98 N407AS ANC SEA 5 2354 -11 21 15 205 194 169 1448 404 4 430 408 -22 0 0 NA NA NA NA NA 2015-01-01
2015 1 1 4 AA 2336 N3KUAA LAX PBI 10 2 -8 12 14 280 279 263 2330 737 4 750 741 -9 0 0 NA NA NA NA NA 2015-01-01
2015 1 1 4 US 840 N171US SFO CLT 20 18 -2 16 34 286 293 266 2296 800 11 806 811 5 0 0 NA NA NA NA NA 2015-01-01
2015 1 1 4 AA 258 N3HYAA LAX MIA 20 15 -5 15 30 285 281 258 2342 748 8 805 756 -9 0 0 NA NA NA NA NA 2015-01-01
2015 1 1 4 AS 135 N527AS SEA ANC 25 24 -1 11 35 235 215 199 1448 254 5 320 259 -21 0 0 NA NA NA NA NA 2015-01-01
2015 1 1 4 DL 806 N3730B SFO MSP 25 20 -5 18 38 217 230 206 1589 604 6 602 610 8 0 0 NA NA NA NA NA 2015-01-01


  • Reformat the dataset into boolean format
flights[ , CANCELLED := ifelse(CANCELLED == 1, TRUE, FALSE)]


  • Create new columns
flights[ , AvgSpeed := DISTANCE / (AIR_TIME/60)]


  • Update the data.table selectively
flights[ CANCELLATION_REASON2 == 'A', CANCELLATION_REASON := 'Airline' ]
flights[ CANCELLATION_REASON2 == 'B', CANCELLATION_REASON := 'Weather'] 
flights[ CANCELLATION_REASON2 == 'C', CANCELLATION_REASON := 'National Air System'] 
flights[ CANCELLATION_REASON2 == 'D', CANCELLATION_REASON := 'Security'] 
flights[ CANCELLATION_REASON2 == '', CANCELLATION_REASON := NA] 
flights[!is.na(CANCELLATION_REASON2), .N, by = CANCELLATION_REASON2] %>%
    ggplot(aes(x = CANCELLATION_REASON2, y = N)) +
    geom_histogram(stat = 'identity')


[As a function]

cancelReasonUpdate <- function(inStr){
  switch(inStr,
         "A" = "Airline",
         "B" = "Weather",
         "C" = "National Air System",
         "D" = "Security",
               NA)
}

flights[ , CANCELLATION_REASON2 := map_chr(CANCELLATION_REASON,cancelReasonUpdate)]


  • Delete columns
flights[ , CANCELLATION_REASON2 := NULL]


  • Grouping
flights[ , MaxAvgSpeed := max(AvgSpeed, na.rm = TRUE), by = .(ORIGIN_AIRPORT, DESTINATION_AIRPORT)]

note that the group level statistic is now added to each observation


6) join

  • Load the airports dataset
airports <- fread("Data/airports.csv")
airports %>% head() %>% knitr::kable()
IATA_CODE AIRPORT CITY STATE COUNTRY LATITUDE LONGITUDE
ABE Lehigh Valley International Airport Allentown PA USA 40.65236 -75.44040
ABI Abilene Regional Airport Abilene TX USA 32.41132 -99.68190
ABQ Albuquerque International Sunport Albuquerque NM USA 35.04022 -106.60919
ABR Aberdeen Regional Airport Aberdeen SD USA 45.44906 -98.42183
ABY Southwest Georgia Regional Airport Albany GA USA 31.53552 -84.19447
ACK Nantucket Memorial Airport Nantucket MA USA 41.25305 -70.06018


a. inner join
merge(x = flights,
      y = airports,
      by.x = 'ORIGIN_AIRPORT',
      by.y = 'IATA_CODE') %>% head() %>% knitr::kable()
ORIGIN_AIRPORT YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY Date AvgSpeed MaxAvgSpeed AIRPORT CITY STATE COUNTRY LATITUDE LONGITUDE
ABE 2015 1 1 4 EV 5103 N846AS DTW 600 552 -8 12 604 113 101 84 425 728 5 753 733 -20 0 FALSE NA NA NA NA NA 2015-01-01 303.5714 392.3077 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
ABE 2015 1 1 4 EV 5579 N136EV ATL 635 628 -7 13 641 139 126 110 692 831 3 854 834 -20 0 FALSE NA NA NA NA NA 2015-01-01 377.4545 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
ABE 2015 1 1 4 EV 5267 N131EV ATL 1600 1600 0 9 1609 140 119 103 692 1752 7 1820 1759 -21 0 FALSE NA NA NA NA NA 2015-01-01 403.1068 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
ABE 2015 1 2 5 EV 5103 N849AS DTW 600 600 0 9 609 113 100 85 425 734 6 753 740 -13 0 FALSE NA NA NA NA NA 2015-01-02 300.0000 392.3077 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
ABE 2015 1 2 5 EV 6144 N15912 ORD 600 607 7 8 615 140 139 125 654 720 6 720 726 6 0 FALSE NA NA NA NA NA 2015-01-02 313.9200 445.9091 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
ABE 2015 1 2 5 EV 5579 N538CA ATL 635 627 -8 13 640 139 143 120 692 840 10 854 850 -4 0 FALSE NA NA NA NA NA 2015-01-02 346.0000 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404


b. left outer join
flights[airports, on = c(ORIGIN_AIRPORT = 'IATA_CODE')] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY Date AvgSpeed MaxAvgSpeed AIRPORT CITY STATE COUNTRY LATITUDE LONGITUDE
2015 1 1 4 EV 5103 N846AS ABE DTW 600 552 -8 12 604 113 101 84 425 728 5 753 733 -20 0 FALSE NA NA NA NA NA 2015-01-01 303.5714 392.3077 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
2015 1 1 4 EV 5579 N136EV ABE ATL 635 628 -7 13 641 139 126 110 692 831 3 854 834 -20 0 FALSE NA NA NA NA NA 2015-01-01 377.4545 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
2015 1 1 4 EV 5267 N131EV ABE ATL 1600 1600 0 9 1609 140 119 103 692 1752 7 1820 1759 -21 0 FALSE NA NA NA NA NA 2015-01-01 403.1068 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
2015 1 2 5 EV 5103 N849AS ABE DTW 600 600 0 9 609 113 100 85 425 734 6 753 740 -13 0 FALSE NA NA NA NA NA 2015-01-02 300.0000 392.3077 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
2015 1 2 5 EV 6144 N15912 ABE ORD 600 607 7 8 615 140 139 125 654 720 6 720 726 6 0 FALSE NA NA NA NA NA 2015-01-02 313.9200 445.9091 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
2015 1 2 5 EV 5579 N538CA ABE ATL 635 627 -8 13 640 139 143 120 692 840 10 854 850 -4 0 FALSE NA NA NA NA NA 2015-01-02 346.0000 494.2857 Lehigh Valley International Airport Allentown PA USA 40.65236 -75.4404
flights[airports, on = "ORIGIN_AIRPORT == IATA_CODE"] %>% head() %>% knitr::kable()
YEAR MONTH DAY DAY_OF_WEEK AIRLINE FLIGHT_NUMBER TAIL_NUMBER ORIGIN_AIRPORT DESTINATION_AIRPORT SCHEDULED_DEPARTURE DEPARTURE_TIME DEPARTURE_DELAY TAXI_OUT WHEELS_OFF SCHEDULED_TIME ELAPSED_TIME AIR_TIME DISTANCE WHEELS_ON TAXI_IN SCHEDULED_ARRIVAL ARRIVAL_TIME ARRIVAL_DELAY DIVERTED CANCELLED CANCELLATION_REASON AIR_SYSTEM_DELAY SECURITY_DELAY AIRLINE_DELAY LATE_AIRCRAFT_DELAY WEATHER_DELAY Date AvgSpeed MaxAvgSpeed AIRPORT CITY STATE COUNTRY LATITUDE