library(tidyverse)
Generate the correct format string to parse each of the following dates and times:
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c("August 19 (2015) - 3:04PM", "July 1 (2015) - 4:04PM")
d5 <- "12/30/14" # Dec 30, 2014
t1 <- "1705" # 5:05PM
parse_date(d1, format = "%B %d, %Y")
[1] "2010-01-01"
parse_date(d2, format = "%Y-%b-%d")
[1] "2015-03-07"
parse_date(d3, format = "%d-%b-%Y")
[1] "2017-06-06"
parse_datetime(d4, format = "%B %d (%Y) - %I:%M%p")
[1] "2015-08-19 15:04:00 UTC" "2015-07-01 16:04:00 UTC"
parse_time(t1, format = "%H%M")
17:05:00
Download this NCHS dataset on leading Causes of death in the United States, from 1999 to 2015: https://data.cdc.gov/api/views/bi63-dtpu/rows.csv.
Then, import it into R. Are some of the colums the wrong type? If not is there any column that could be a factor instead of character type?
df <- read_csv("https://data.cdc.gov/api/views/bi63-dtpu/rows.csv")
Parsed with column specification:
cols(
Year = col_integer(),
`113 Cause Name` = col_character(),
`Cause Name` = col_character(),
State = col_character(),
Deaths = col_integer(),
`Age-adjusted Death Rate` = col_double()
)
df
dplyr
verbsLoad in the dataset movies.csv
used in the lecture:
url <- "https://raw.githubusercontent.com/Juanets/movie-stats/master/movies.csv"
movies <- read_csv(url)
Parsed with column specification:
cols(
budget = col_double(),
company = col_character(),
country = col_character(),
director = col_character(),
genre = col_character(),
gross = col_double(),
name = col_character(),
rating = col_character(),
released = col_character(),
runtime = col_integer(),
score = col_double(),
star = col_character(),
votes = col_integer(),
writer = col_character(),
year = col_integer()
)
movies
movies.sub <- filter(movies, year > 2010)
movies.sub
movies.sub <- select(movies.sub, name, director, year, country, genre, budget, gross, score)
movies.sub
round()
to round numbers.movies.sub <- mutate(movies.sub,
frac_profit = (gross - budget)/budget,
budget = round(budget/10^6, digits = 1),
gross = round(gross/10^6, digits = 1))
movies.sub
by_genre <- group_by(movies.sub, genre)
arrange(summarise(by_genre, count = n()), desc(count))
movies.summary <- movies.sub %>%
group_by(genre, country) %>%
summarise(count = n(),
median_profit = median(frac_profit),
mean_score = mean(score),
sd_score = sd(score)) %>%
filter(count > 10) %>%
arrange(desc(mean_score))
movies.summary
Using chaining and pipes, for each genre find the three directors with the top mean movie scores received for the movies produced after 2000, but do not include the directors with fewer than 3 movies in total. Hint: Use top_n()
function to select top n from each group.
top5_dir <- movies %>%
filter(year > 2000) %>%
group_by(genre, director) %>%
summarise(
mean_score = mean(score),
count = n()) %>%
filter(count >= 3) %>%
group_by(genre) %>%
top_n(5, wt = mean_score)
top5_dir
Pick your favourite genre and the top 3 directors to find movie recommendations for your next movie night!
class(top5_dir)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
See that top5_dir
has a class ‘grouped_df’, so we convert it to a data frame first.
topDramaDir <- top5_dir %>%
as_data_frame() %>%
filter(genre == "Drama") %>%
select(director)
movies %>%
filter(genre == "Drama",
director %in% topDramaDir[["director"]]) %>%
select(name, director, year, score, genre, gross, budget) %>%
arrange(desc(score))