library(tidyverse) library(mdsr) # install.packages("nycflights13") library(nycflights13) ## Don't run this yet, it's just an example q <- flights %>% filter( year == 2016 & month == 9, dest == "JFK" ) %>% inner_join(carriers, by = c("carrier" = "carrier")) %>% group_by(name) %>% summarize( N = n(), pct_ontime = sum(arr_delay <= 15) / n() ) %>% filter(N >= 100) %>% arrange(desc(pct_ontime)) head(q, 4) ## creating tbl_sql objects 'flights' and 'carriers' (NOT data frames anymore) db <- dbConnect_scidb("airlines") flights <- tbl(db, "flights") carriers <- tbl(db, "carriers") ## ----connect-show, eval=FALSE------------------------------------------------- ## library(tidyverse) ## library(mdsr) ## db <- dbConnect_scidb("airlines") ## flights <- tbl(db, "flights") ## carriers <- tbl(db, "carriers") ## checking the class of the created tbl objects class(flights) class(carriers) ## Now run the query called 'q' above, and use show_query to convert it into SQL code: show_query(q) ## Note the 'translated' SQL code is a bit ugly, with temporary table names ######## The following code is a cleaner version of the SQL query: ## SELECT ## c.name, ## SUM(1) AS N, ## SUM(arr_delay <= 15) / SUM(1) AS pct_ontime ## FROM flights AS f ## JOIN carriers AS c ON f.carrier = c.carrier ## WHERE year = 2016 AND month = 9 ## AND dest = 'JFK' ## GROUP BY name ## HAVING N >= 100 ## ORDER BY pct_ontime DESC ## LIMIT 0,4; ## The 'translate_sql' function in the 'dbplyr' package translates simple R commands to SQL commands: library(dbplyr) con <- simulate_dbi() translate_sql(mean(arr_delay, na.rm = TRUE),con=con) # Slight change to book's code to avoid an error... ## The 'translate_sql' function doesn't work with all R functions, just the most basic ones. It doesn't work with this copy of 'paste0': my_paste <- paste0 translate_sql(my_paste("this", "is", "a", "string"),con=con) # It does work with the original 'paste0' function, though: translate_sql(paste0("this", "is", "a", "string"),con=con) ## This 'mutate' operation will not work: carriers %>% mutate(name_code = my_paste(name, "(", carrier, ")")) class(carriers) ## This 'mutate' operation will be translated correctly: carriers %>% mutate(name_code = paste0(name, "(", carrier, ")")) class(carriers) ## Or we can simply use the corresponding SQL command, CONCAT, directly: carriers %>% mutate(name_code = CONCAT(name, "(", carrier, ")")) ## Using collect() allows the 'mutate' operation involving 'my_paste' to work, and the result returns a data frame: carriers %>% collect() %>% mutate(name_code = my_paste(name, "(", carrier, ")")) ## Comparing the memory usage when we keep the table on the SQL server versus ... carriers %>% object.size() %>% print(units = "Kb") ## ... when we pull the data into R: carriers %>% collect() %>% object.size() %>% print(units = "Kb") ## How much memory does a data frame with 100 columns and 1 million rows take up? n <- 100 * 1e6 x <- matrix(runif(n), ncol = 100) dim(x) print(object.size(x), units = "Mb") # removing the object from the workspace now: rm(x) ## Install and load package 'sqldf': #install.packages("sqldf") library(sqldf) ## SHOW TABLES; ## Describing the 'fields" (columns) in a table: ## DESCRIBE airports; # This works in the MySQL implementation: # This is the equivalent in SQLite: sqldf(" PRAGMA table_info(airports); ") ## SELECT * FROM flights; ## SELECT * FROM flights LIMIT 0,10; data(flights) # load the 'flights' table into the workspace sqldf("select * from flights limit 0,10") ## SELECT year, month, day, dep_time, sched_dep_time, dep_delay, origin ## FROM flights ## LIMIT 0, 10; sqldf("SELECT year, month, day, dep_time, sched_dep_time, dep_delay, origin FROM flights LIMIT 0, 10") ### Selects two columns from 'airports': the faa codes and the airport names: ## SELECT faa, name FROM airports; sqldf("SELECT faa, name FROM airports") ## SELECT ## name, ## CONCAT('(', lat, ', ', lon, ')') ## FROM airports ## LIMIT 0, 6; sqldf("SELECT name, CONCAT('(', lat, ', ', lon, ')') FROM airports LIMIT 0, 6") ## The latitude/longitude column is correct in the table, but the column name is ugly. ## Let's rename it using AS: ## SELECT ## name, ## CONCAT('(', lat, ', ', lon, ')') AS coords ## FROM airports ## LIMIT 0, 6; sqldf("SELECT name, CONCAT('(', lat, ', ', lon, ')') AS coords FROM airports LIMIT 0, 6") ## Let's rename 'name' to 'airport_name' using AS: sqldf("SELECT name AS airport_name, CONCAT('(', lat, ', ', lon, ')') AS coords FROM airports LIMIT 0, 6") ### Using the WHERE clause to pick only the rows that meet some condition (changed the condition from the book example): sqldf("SELECT year, month, day, origin, dest, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6") ## This use of the SQLite function 'strftime' doesn't produce quite the right result: sqldf(" SELECT strftime('%Y-%m-%d', CONCAT(year, '-', month, '-', day)) AS theDate, origin, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6 ") ## It works OK without 'strftime': sqldf(" SELECT CONCAT(year, '-', month, '-', day) AS theDate, origin, flight, carrier FROM flights WHERE theDate = '2013-6-26' AND origin = 'LGA' LIMIT 0, 6 ") ## Note the difference between the BETWEEN and IN operators used in logical conditions: sqldf(" SELECT DISTINCT CONCAT(year, '-', month, '-', day) AS theDate FROM flights WHERE year = 2013 AND month = 6 AND day BETWEEN 26 and 30 AND origin = 'LGA' LIMIT 0, 6 ") sqldf(" SELECT DISTINCT CONCAT(year, '-', month, '-', day) AS theDate FROM flights WHERE year = 2013 AND month = 6 AND day IN (26, 30) AND origin = 'LGA' LIMIT 0, 6 ") ## loading packages: library(mdsr) library(DBI) db <- dbConnect_scidb("airlines") ### Note that in a WHERE clause, the AND clauses are always evaluated before the OR clauses: ## /* returns more records */ ## This returns records from the 26th of any month, not just June and not just 2013: sqldf(" SELECT COUNT(*) AS N FROM flights WHERE year = 2013 AND month = 6 OR day = 26 AND origin = 'LGA' ") ## /* returns fewer records */ ## This returns records from the 26th of any month or any records from June, but ONLY for flights in 2013: sqldf(" SELECT COUNT(*) AS N FROM flights WHERE year = 2013 AND (month = 6 OR day = 26) AND origin = 'LGA'; ") ## Use of GROUP BY to get counts by carrier: sqldf(" SELECT carrier, COUNT(*) AS numFlights, SUM(1) AS numFlightsAlso FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' GROUP BY carrier; ") sqldf(" SELECT carrier, COUNT(*) AS numFlights, MIN(dep_time) FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' GROUP BY carrier; ") ## Doesn't work with SQLite functions, may work with regular SQL: sqldf(" SELECT carrier, COUNT(*) AS numFlights, MAKETIME( IF(LENGTH(MIN(dep_time)) = 3, LEFT(MIN(dep_time), 1), LEFT(MIN(dep_time), 2) ), RIGHT(MIN(dep_time), 2), 0 ) AS firstDepartureTime FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' GROUP BY carrier LIMIT 0, 6; ") ## This shows the worst arrival delay, by airline, for 6/26/2013: sqldf(" SELECT carrier, COUNT(*) AS numFlights, MAX(arr_delay) AS WorstDelay FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' GROUP BY carrier LIMIT 0, 6; ") ## This shows the means and variances of arrival delays, by airline, for all of 2013: sqldf(" SELECT carrier, COUNT(*) AS numFlights, AVG(arr_delay) AS DelayMean, ( COUNT(*)*SUM(arr_delay * arr_delay) - (SUM(arr_delay)*SUM(arr_delay)) ) / ( (COUNT(*)-1)*(COUNT(*)) ) AS DelayVariance FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY carrier LIMIT 0, 15; ") ### Grouping by multiple columns: ## This shows the means of arrival delays, by airline/destination combination, for June 2013: sqldf(" SELECT carrier, dest, COUNT(*) AS numFlights, AVG(arr_delay) AS DelayMean FROM flights WHERE year = 2013 AND month = 6 AND origin = 'LGA' GROUP BY carrier, dest LIMIT 0, 40; ") ## Ordering the result with ORDER BY: sqldf(" SELECT dest, SUM(1) AS numFlights FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest ORDER BY numFlights DESC LIMIT 0, 10; ") ## Can also use ORDER BY without GROUP BY (this will print the results alphabetically by airport_name): ## Note it also works even though 'airport_name' is an alias we created: sqldf("SELECT name AS airport_name, CONCAT('(', lat, ', ', lon, ')') AS coords FROM airports ORDER BY airport_name LIMIT 0, 6") ### One more example of ORDER BY without GROUP BY (this lists results in reverse alphabetical order by carrier): sqldf("SELECT year, month, day, origin, dest, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' ORDER BY carrier DESC LIMIT 0, 16") ## Which destinations have the lowest average delay for 2013 flights from LaGuardia? sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest ORDER BY avg_arr_delay ASC LIMIT 0, 6; ") ### Restricting the result set with HAVING: ## Among all destinations with at least 730 flights in 2013 (at least 2 flights per day), ## which destinations have the lowest average delay for 2013 flights from LaGuardia? sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest HAVING numFlights > 365 * 2 ORDER BY avg_arr_delay ASC LIMIT 0, 6; ") # Note this HAVING clause works even though it's based on a derived column alias # (only true for some SQL implementations) ### The following does NOT work, because 'numFlights' and 'avg_arr_delay' are part of the result set, ### not part of the original data table, so we can't use 'numFlights' in the WHERE condition: sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' AND numFlights > 365 * 2 GROUP BY dest ORDER BY avg_arr_delay ASC LIMIT 0, 6; ") ## Examples showing the usage of LIMIT: sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest HAVING numFlights > 365 * 2 ORDER BY avg_arr_delay ASC LIMIT 0, 6; ") sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest HAVING numFlights > 365 * 2 ORDER BY avg_arr_delay ASC LIMIT 5, 10; ") sqldf(" SELECT dest, SUM(1) AS numFlights, AVG(arr_delay) AS avg_arr_delay FROM flights WHERE year = 2013 AND origin = 'LGA' GROUP BY dest HAVING numFlights > 365 * 2 ORDER BY avg_arr_delay ASC LIMIT 0, 4; ") #### Examples of Joins in SQL: # A simple query: sqldf(" SELECT origin, dest, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6; ") # This just lists the FAA airport code, not the full airport name. # The 'airports' table has the FAA code and also the full airport name. # If we join these tables together, we can get the full airport name for the destinations, along with the information on the flights. # Note the JOIN does an INNER JOIN. sqldf(" SELECT origin, dest, airports.name AS dest_name, flight, carrier FROM flights JOIN airports ON flights.dest = airports.faa WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6; ") ## Using table aliases: ## Making 'o' the alias for 'flights', and making 'a' the alias for 'airports': sqldf(" SELECT origin, dest, a.name AS dest_name, flight, carrier FROM flights AS o JOIN airports AS a ON o.dest = a.faa WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6; ") # The 'airlines' table has the carrier code and also the full carrier (airline) name. # If we join 'flights' and 'airlines' together, we can get the full carrier name for the destinations, along with the information on the flights. # Note the book's code calls the table 'carriers', but the name here is actually 'airlines'... sqldf(" SELECT dest, a.name AS dest_name, o.carrier, c.name AS carrier_name FROM flights AS o JOIN airports AS a ON o.dest = a.faa JOIN airlines AS c ON o.carrier = c.carrier WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6; ") sqldf(" SELECT flight, a2.name AS orig_name, a1.name AS dest_name, c.name AS carrier_name FROM flights AS o JOIN airports AS a1 ON o.dest = a1.faa JOIN airports AS a2 ON o.origin = a2.faa JOIN airlines AS c ON o.carrier = c.carrier WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' LIMIT 0, 6; ") ## Describing the 'fields" (columns) in a table: ## DESCRIBE airports; # This works in the MySQL implementation: # This is the equivalent in SQLite: sqldf(" PRAGMA table_info(airports); ") ### A LEFT JOIN: ## This will capture any destination airports in the 'flights' table whose airport information is not available in the 'airports' table sqldf(" SELECT year, month, day, origin, dest, a.name AS dest_name, flight, carrier FROM flights AS o LEFT JOIN airports AS a ON o.dest = a.faa WHERE year = 2013 AND month = 6 AND day = 26 AND a.name is null LIMIT 0, 6; ") # Note the destinations returned that don't have a match in the 'airports' table are all in Puerto Rico... ### Combining two separate queries with UNION: sqldf(" SELECT year, month, day, origin, dest, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'LGA' AND dest = 'MSP' UNION SELECT year, month, day, origin, dest, flight, carrier FROM flights WHERE year = 2013 AND month = 6 AND day = 26 AND origin = 'JFK' AND dest = 'ORD' LIMIT 0,10; ") ## The LIMIT clause applies to the combined UNION, not to the second query ## Run without the LIMIT clause to see the complete UNION results ## Subqueries: ## Picking airports in time zones west of the contiguous U.S., such as Alaska and Hawaii: # Change column name in book's code from 'city' to 'tzone': sqldf(" SELECT faa, name, tz, tzone FROM airports AS a WHERE tz < -8 LIMIT 0, 6; ") # Or SELECT * will return ALL the columns in the table: sqldf(" SELECT * FROM airports AS a WHERE tz < -8 LIMIT 0, 6; ") ## Taking the results of some first query (about flights to the West Coast and beyond) and then doing another query on that, ## so that the first query is a *subquery* sqldf(" SELECT dest, a.name AS dest_name, SUM(1) AS N, COUNT(distinct carrier) AS numCarriers FROM flights AS o LEFT JOIN airports AS a ON o.dest = a.faa WHERE year = 2013 AND origin = 'JFK' AND dest IN (SELECT faa FROM airports WHERE tz < -7) GROUP BY dest; ")