library(tidyverse)
library(mdsr)

# install.packages("nycflights13")
library(nycflights13)

## Don't run this yet, it's just an example 
q <- flights %>%
  filter(
    year == 2016 & month == 9, 
    dest == "JFK"
  ) %>%
  inner_join(carriers, by = c("carrier" = "carrier")) %>%
  group_by(name) %>%
  summarize(
    N = n(), 
    pct_ontime = sum(arr_delay <= 15) / n()
  ) %>%
  filter(N >= 100) %>%
  arrange(desc(pct_ontime))
head(q, 4)


## creating tbl_sql objects 'flights' and 'carriers' (NOT data frames anymore)
db <- dbConnect_scidb("airlines")
flights <- tbl(db, "flights")
carriers <- tbl(db, "carriers")


## ----connect-show, eval=FALSE-------------------------------------------------
## library(tidyverse)
## library(mdsr)
## db <- dbConnect_scidb("airlines")
## flights <- tbl(db, "flights")
## carriers <- tbl(db, "carriers")


## checking the class of the created tbl objects
class(flights)
class(carriers)

## Now run the query called 'q' above, and use show_query to convert it into SQL code:
show_query(q)
## Note the 'translated' SQL code is a bit ugly, with temporary table names

######## The following code is a cleaner version of the SQL query:

## SELECT
##   c.name,
##   SUM(1) AS N,
##   SUM(arr_delay <= 15) / SUM(1) AS pct_ontime
## FROM flights AS f
## JOIN carriers AS c ON f.carrier = c.carrier
## WHERE year = 2016 AND month = 9
##   AND dest = 'JFK'
## GROUP BY name
## HAVING N >= 100
## ORDER BY pct_ontime DESC
## LIMIT 0,4;


## The 'translate_sql' function in the 'dbplyr' package translates simple R commands to SQL commands:
library(dbplyr)

con <- simulate_dbi()
translate_sql(mean(arr_delay, na.rm = TRUE),con=con)  # Slight change to book's code to avoid an error...


## The 'translate_sql' function doesn't work with all R functions, just the most basic ones.  It doesn't work with this copy of 'paste0':
my_paste <- paste0
translate_sql(my_paste("this", "is", "a", "string"),con=con)

# It does work with the original 'paste0' function, though:
translate_sql(paste0("this", "is", "a", "string"),con=con)


## This 'mutate' operation will not work:
carriers %>%
  mutate(name_code = my_paste(name, "(", carrier, ")"))
class(carriers)

## This 'mutate' operation will be translated correctly:
carriers %>%
  mutate(name_code = paste0(name, "(", carrier, ")"))
class(carriers)


## Or we can simply use the corresponding SQL command, CONCAT, directly:
carriers %>%
  mutate(name_code = CONCAT(name, "(", carrier, ")"))


## Using collect() allows the 'mutate' operation involving 'my_paste' to work, and the result returns a data frame:
carriers %>%
  collect() %>%
  mutate(name_code = my_paste(name, "(", carrier, ")"))


## Comparing the memory usage when we keep the table on the SQL server versus ...
carriers %>%
  object.size() %>%
  print(units = "Kb")

## ... when we pull the data into R:
carriers %>%
  collect() %>%
  object.size() %>%
  print(units = "Kb")


## How much memory does a data frame with 100 columns and 1 million rows take up?
n <- 100 * 1e6
x <- matrix(runif(n), ncol = 100)
dim(x)
print(object.size(x), units = "Mb")

# removing the object from the workspace now:
rm(x)


## Install and load package 'sqldf':
#install.packages("sqldf")
library(sqldf)


## SHOW TABLES;


## Describing the 'fields" (columns) in a table:

## DESCRIBE airports;  # This works in the MySQL implementation:

# This is the equivalent in SQLite:

sqldf("
PRAGMA table_info(airports);
")


## SELECT * FROM flights;


## SELECT * FROM flights LIMIT 0,10;

data(flights)  # load the 'flights' table into the workspace
sqldf("select * from flights limit 0,10")


## SELECT year, month, day, dep_time, sched_dep_time, dep_delay, origin

## FROM flights

## LIMIT 0, 10;


sqldf("SELECT year, month, day, dep_time, sched_dep_time, dep_delay, origin FROM flights LIMIT 0, 10")

### Selects two columns from 'airports':  the faa codes and the airport names:

## SELECT faa, name FROM airports;

sqldf("SELECT faa, name FROM airports")  

## SELECT

##   name,

##   CONCAT('(', lat, ', ', lon, ')')

## FROM airports

## LIMIT 0, 6;

sqldf("SELECT
name,
CONCAT('(', lat, ', ', lon, ')')
FROM airports
LIMIT 0, 6")

## The latitude/longitude column is correct in the table, but the column name is ugly.
## Let's rename it using AS:

## SELECT

##   name,

##   CONCAT('(', lat, ', ', lon, ')') AS coords

## FROM airports

## LIMIT 0, 6;

sqldf("SELECT
name,
CONCAT('(', lat, ', ', lon, ')') AS coords
FROM airports
LIMIT 0, 6")


## Let's rename 'name' to 'airport_name' using AS:

sqldf("SELECT
name AS airport_name,
CONCAT('(', lat, ', ', lon, ')') AS coords
FROM airports
LIMIT 0, 6")

### Using the WHERE clause to pick only the rows that meet some condition (changed the condition from the book example):

sqldf("SELECT
year, month, day, origin, dest,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6")


## This use of the SQLite function 'strftime' doesn't produce quite the right result:
sqldf("
SELECT
strftime('%Y-%m-%d', CONCAT(year, '-', month, '-', day)) AS theDate,
origin,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6
")

## It works OK without 'strftime':
sqldf("
SELECT
CONCAT(year, '-', month, '-', day) AS theDate,
origin, flight, carrier
FROM flights
WHERE theDate = '2013-6-26'
AND origin = 'LGA'
LIMIT 0, 6
")

## Note the difference between the BETWEEN and IN operators used in logical conditions:

sqldf("
SELECT
DISTINCT CONCAT(year, '-', month, '-', day)
AS theDate
FROM flights
WHERE year = 2013 AND month = 6 AND day BETWEEN 26 and 30
AND origin = 'LGA'
LIMIT 0, 6
")

sqldf("
SELECT
DISTINCT CONCAT(year, '-', month, '-', day)
AS theDate
FROM flights
WHERE year = 2013 AND month = 6 AND day IN (26, 30)
AND origin = 'LGA'
LIMIT 0, 6
")


## loading packages:
library(mdsr)
library(DBI)
db <- dbConnect_scidb("airlines")


### Note that in a WHERE clause, the AND clauses are always evaluated before the OR clauses:

## /* returns more records  */
## This returns records from the 26th of any month, not just June and not just 2013:
sqldf("
SELECT
COUNT(*) AS N
FROM flights
WHERE year = 2013 AND month = 6 OR day = 26
AND origin = 'LGA'
")

## /* returns fewer records  */
## This returns records from the 26th of any month or any records from June, but ONLY for flights in 2013:

sqldf("
SELECT
COUNT(*) AS N
FROM flights
WHERE year = 2013 AND (month = 6 OR day = 26)
AND origin = 'LGA';
")

## Use of GROUP BY to get counts by carrier:

sqldf("
SELECT
carrier,
COUNT(*) AS numFlights,
SUM(1) AS numFlightsAlso
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
GROUP BY carrier;
")


sqldf("
SELECT
carrier,
COUNT(*) AS numFlights,
MIN(dep_time)
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
 AND origin = 'LGA'
GROUP BY carrier;
")


## Doesn't work with SQLite functions, may work with regular SQL:
sqldf("
 SELECT
  carrier,
  COUNT(*) AS numFlights,
  MAKETIME(
     IF(LENGTH(MIN(dep_time)) = 3,
       LEFT(MIN(dep_time), 1),
       LEFT(MIN(dep_time), 2)
     ),
     RIGHT(MIN(dep_time), 2),
    0
    ) AS firstDepartureTime
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
GROUP BY carrier
LIMIT 0, 6;
")

## This shows the worst arrival delay, by airline, for 6/26/2013:
sqldf("
 SELECT
  carrier,
  COUNT(*) AS numFlights,
  MAX(arr_delay) AS WorstDelay
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
GROUP BY carrier
LIMIT 0, 6;
")

## This shows the means and variances of arrival delays, by airline, for all of 2013:
sqldf("
 SELECT
  carrier,
  COUNT(*) AS numFlights,
  AVG(arr_delay) AS DelayMean,
  ( COUNT(*)*SUM(arr_delay * arr_delay) - (SUM(arr_delay)*SUM(arr_delay)) ) / ( (COUNT(*)-1)*(COUNT(*)) ) AS DelayVariance
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY carrier
LIMIT 0, 15;
")

### Grouping by multiple columns:
## This shows the means of arrival delays, by airline/destination combination, for June 2013:
sqldf("
 SELECT
  carrier, dest,
  COUNT(*) AS numFlights,
  AVG(arr_delay) AS DelayMean
FROM flights
WHERE year = 2013 AND month = 6
AND origin = 'LGA'
GROUP BY carrier, dest
LIMIT 0, 40;
")

## Ordering the result with ORDER BY:
sqldf("
SELECT
dest, SUM(1) AS numFlights
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
ORDER BY numFlights DESC
LIMIT 0, 10;
")


## Can also use ORDER BY without GROUP BY (this will print the results alphabetically by airport_name):
## Note it also works even though 'airport_name' is an alias we created:

sqldf("SELECT
name AS airport_name,
CONCAT('(', lat, ', ', lon, ')') AS coords
FROM airports
ORDER BY airport_name
LIMIT 0, 6")


### One more example of  ORDER BY without GROUP BY (this lists results in reverse alphabetical order by carrier):

sqldf("SELECT
year, month, day, origin, dest,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
ORDER BY carrier DESC
LIMIT 0, 16")

## Which destinations have the lowest average delay for 2013 flights from LaGuardia?

sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
ORDER BY avg_arr_delay ASC
LIMIT 0, 6;
")

### Restricting the result set with HAVING:
## Among all destinations with at least 730 flights in 2013 (at least 2 flights per day), 
## which destinations have the lowest average delay for 2013 flights from LaGuardia?

sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
HAVING numFlights > 365 * 2
ORDER BY avg_arr_delay ASC
LIMIT 0, 6;
")

# Note this HAVING clause works even though it's based on a derived column alias 
# (only true for some SQL implementations)


### The following does NOT work, because 'numFlights' and 'avg_arr_delay' are part of the result set,
### not part of the original data table, so we can't use 'numFlights' in the WHERE condition:

sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
AND numFlights > 365 * 2
GROUP BY dest
ORDER BY avg_arr_delay ASC
LIMIT 0, 6;
")

## Examples showing the usage of LIMIT:

sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
HAVING numFlights > 365 * 2
ORDER BY avg_arr_delay ASC
LIMIT 0, 6;
")


sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
HAVING numFlights > 365 * 2
ORDER BY avg_arr_delay ASC
LIMIT 5, 10;
")

sqldf("
SELECT
dest, SUM(1) AS numFlights,
AVG(arr_delay) AS avg_arr_delay
FROM flights
WHERE year = 2013
AND origin = 'LGA'
GROUP BY dest
HAVING numFlights > 365 * 2
ORDER BY avg_arr_delay ASC
LIMIT 0, 4;
")


#### Examples of Joins in SQL:

# A simple query:

sqldf("
SELECT
origin, dest,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6;
")
# This just lists the FAA airport code, not the full airport name.

# The 'airports' table has the FAA code and also the full airport name.
# If we join these tables together, we can get the full airport name for the destinations, along with the information on the flights.

# Note the JOIN does an INNER JOIN.

sqldf("
SELECT
origin, dest,
airports.name AS dest_name,
flight, carrier
FROM flights
JOIN airports ON flights.dest = airports.faa
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6;
")


## Using table aliases:  
## Making 'o' the alias for 'flights', and making 'a' the alias for 'airports':

sqldf("
SELECT
origin, dest,
a.name AS dest_name,
flight, carrier
FROM flights AS o
JOIN airports AS a ON o.dest = a.faa
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6;
")


# The 'airlines' table has the carrier code and also the full carrier (airline) name.
# If we join 'flights' and 'airlines' together, we can get the full carrier name for the destinations, along with the information on the flights.

# Note the book's code calls the table 'carriers', but the name here is actually 'airlines'...

sqldf("
SELECT
dest, a.name AS dest_name,
o.carrier, c.name AS carrier_name
FROM flights AS o
JOIN airports AS a ON o.dest = a.faa
JOIN airlines AS c ON o.carrier = c.carrier
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6;
")


sqldf("
SELECT
flight,
a2.name AS orig_name,
a1.name AS dest_name,
c.name AS carrier_name
FROM flights AS o
JOIN airports AS a1 ON o.dest = a1.faa
JOIN airports AS a2 ON o.origin = a2.faa
JOIN airlines AS c ON o.carrier = c.carrier
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA'
LIMIT 0, 6;
")

## Describing the 'fields" (columns) in a table:

## DESCRIBE airports;  # This works in the MySQL implementation:

# This is the equivalent in SQLite:

sqldf("
PRAGMA table_info(airports);
")

### A LEFT JOIN:
## This will capture any destination airports in the 'flights' table whose airport information is not available in the 'airports' table

sqldf("
SELECT
year, month, day, origin, dest,
a.name AS dest_name,
flight, carrier
FROM flights AS o
LEFT JOIN airports AS a ON o.dest = a.faa
WHERE year = 2013 AND month = 6 AND day = 26
AND a.name is null
LIMIT 0, 6;
")
# Note the destinations returned that don't have a match in the 'airports' table are all in Puerto Rico...


### Combining two separate queries with UNION:

sqldf("
SELECT
year, month, day, origin, dest,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'LGA' AND dest = 'MSP'
UNION
SELECT
year, month, day, origin, dest,
flight, carrier
FROM flights
WHERE year = 2013 AND month = 6 AND day = 26
AND origin = 'JFK' AND dest = 'ORD'
LIMIT 0,10;
")
## The LIMIT clause applies to the combined UNION, not to the second query
## Run without the LIMIT clause to see the complete UNION results


## Subqueries:

## Picking airports in time zones west of the contiguous U.S., such as Alaska and Hawaii:
# Change column name in book's code from 'city' to 'tzone':
sqldf("
SELECT faa, name, tz, tzone
FROM airports AS a
WHERE tz < -8
LIMIT 0, 6;
")

# Or SELECT * will return ALL the columns in the table:

sqldf("
SELECT *
FROM airports AS a
WHERE tz < -8
LIMIT 0, 6;
")


## Taking the results of some first query (about flights to the West Coast and beyond) and then doing another query on that,
## so that the first query is a *subquery*

sqldf("
SELECT
dest, a.name AS dest_name,
SUM(1) AS N, COUNT(distinct carrier) AS numCarriers
FROM flights AS o
LEFT JOIN airports AS a ON o.dest = a.faa
WHERE year = 2013
AND origin = 'JFK'
AND dest IN
 (SELECT faa
  FROM airports
  WHERE tz < -7)
GROUP BY dest;
")