Louis Maiden
ATP World Tour Website Scraping
I had some requests asking how I scraped atptour.com’s historical player-ranking statistics, so I thought I’d post the raw code and eventually turn it into a tutorial on advanced scraping.
library(tidyverse)
library(rvest)
library(lubridate)
library(hrbrthemes)
library(ggthemes)
# Make hrbrthemes' `theme_ipsum_rc()` the session-wide ggplot2 default, with
# both major and minor grid lines removed and the y-axis title drawn
# horizontally (angle 0).
theme_set(
  theme_ipsum_rc() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.title.y = element_text(angle = 0)
    )
)
# Build the character vector of ranking dates to scrape.
# The first column of the header-less CSV holds day-month-year dates; keep the
# ones inside the requested window and render them as "YYYY.MM.DD" strings.
dates <- read_csv("C:/Users/nyulo/Documents/HBS/LM/atp_dates.csv", col_names = FALSE) %>%
  pull(1) %>%
  dmy() %>%
  # subset() also drops entries that failed to parse (NA), like filter() does.
  subset(. >= "1970-01-01" & . < "1978-02-05") %>%
  as.character() %>%
  str_replace_all("\\-","\\.")
# Scrape the ATP singles rankings (rank range 0-100) for every entry in
# `dates`, append biographical details from each player's overview page, and
# write one CSV per ranking date.
counter <- 0
df_main <- tibble()

# Download one player overview page and flatten the first <table> on it into
# a single-row tibble with eight named fields.
scrape_player_info <- function(player_url) {
  player_url %>%
    read_html() %>%
    html_nodes("table") %>%
    .[[1]] %>%
    html_table(fill = TRUE) %>%
    unlist() %>%
    as_tibble() %>%
    rownames_to_column() %>%
    # NOTE(review): the positional renaming below depends on the column order
    # that spread() produces from the character row names; it is deliberately
    # left untouched so the resulting field mapping stays the same.
    spread(key = rowname,value = value) %>%
    select(age = 1, birthplace = 2, turned_pro = 3, residence = 4, weight = 5, strokes = 6, height = 7, coach = 8)
}

for (rank_date in dates) {
  print(rank_date)
  print(paste0((100*(counter / length(dates))),"% Finished"))
  link <- paste0("https://www.atptour.com/en/rankings/singles?rankDate=",
                 rank_date,
                 "&rankRange=0-100")
  print("MADE LINK")

  # Fetch and parse the rankings page once, so the table and the player links
  # are guaranteed to come from the same response (and the site is hit once
  # per date instead of twice).
  rankings_page <- read_html(link)

  # SCRAPE MAIN TABLE
  table_main <- rankings_page %>%
    html_nodes("table") %>%
    .[[1]] %>%
    html_table() %>%
    # `!!` injects the loop value, so it can never be mistaken for a column.
    mutate(date = !!rank_date)
  print("MADE MAIN TABLE")

  # GET LINKS TO EACH PLAYER
  player_links <- rankings_page %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    str_subset("player") %>%
    str_subset("overview")

  # Fail before issuing any player requests if the links cannot be paired
  # one-to-one with the ranking rows.
  if (length(player_links) != nrow(table_main)) {
    stop(
      "Found ", length(player_links), " player links for ",
      nrow(table_main), " ranking rows on ", rank_date,
      call. = FALSE
    )
  }
  table_main <- cbind(table_main, player_links)
  print("GOT PLAYER LINKS")

  # SCRAPE PLAYER LINKS FOR INFO
  player_urls <- paste0("https://www.atptour.com",player_links)
  # Collect the one-row results in a list and combine them in a single call
  # instead of growing a data frame inside a loop. rbind() (rather than
  # bind_rows()) keeps base R's type-coercion behaviour across rows.
  player_details <- do.call(rbind, lapply(player_urls, scrape_player_info))
  table_main <- cbind(table_main, player_details)
  print("CBINDED INFO")

  #df_main <- rbind(df_main, table_main)
  #print("RBINDED INFO")
  counter <- counter + 1

  # The file name uses underscores in place of the dots in the date string.
  # NOTE(review): the later combine step reads from
  # ".../tennis_rankings/data/1_raw/", not from this output directory --
  # confirm whether the files are moved there manually.
  file_date <- str_replace_all(rank_date, "\\.","_")
  write_csv(table_main, paste0("C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/tennis_",file_date,".csv"))

  # Random non-negative pause between ranking dates: |N(mean = 0.2, sd = 1)|.
  Sys.sleep(abs(rnorm(1,.2)))
}
# Combine every per-date CSV in the raw-data directory into one data frame.
raw_dir <- "C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/data/1_raw/"
# Only pick up CSV files so sub-directories or stray files are not passed to
# read_csv().
component_files <- list.files(raw_dir, pattern = "\\.csv$", ignore.case = TRUE)
component_files
if (length(component_files) == 0) {
  # Nothing to read: keep the empty tibble the script would otherwise start
  # from.
  df_new <- tibble()
} else {
  # Read all files first and combine them in a single rbind() call instead of
  # re-copying the accumulated data frame on every iteration. rbind() (rather
  # than bind_rows()) is kept so a column guessed as a different type in
  # different files is still coerced instead of raising an error.
  raw_tables <- lapply(paste0(raw_dir, component_files), read_csv)
  df_new <- do.call(rbind, raw_tables)
}
# Derive analysis columns from the combined rankings:
#   one_hander  - TRUE when `strokes` mentions a one-handed backhand, NA when
#                 the backhand is listed as unknown
#   date / year - `date` parsed as a Date, plus its calendar year
#   ranking     - numeric value parsed from the `Ranking` column
#   rank_cohort - "Rank 1 - 50" or "Rank 51 - 100" (NA for anything above 100)
clean <- df_new %>%
  mutate(
    one_hander = if_else(
      str_detect(strokes, "Unknown Backhand"),
      NA,
      str_detect(strokes, "One-Handed Backhand")
    ),
    date = ymd(date),
    year = year(date),
    ranking = parse_number(Ranking),
    # Conditions are tried in order, so the second branch only ever sees
    # rankings above 50.
    rank_cohort = case_when(
      ranking <= 50 ~ "Rank 1 - 50",
      ranking <= 100 ~ "Rank 51 - 100"
    )
  )

# NOTE(review): save() writes R's binary serialization format even though the
# file name ends in ".R"; read it back with load(), not source().
save(clean, file = "C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/data/atp_top_100.R")