email
ATP World Tour Website Scraping
May 12, 2020
2 minutes read

I had some requests about how I scraped the past player-ranking statistics from atptour.com, so I thought I’d post the raw code here and eventually turn it into a tutorial on advanced scraping.

library(tidyverse)
library(rvest)
library(lubridate)
library(hrbrthemes)
library(ggthemes)
# Make the Roboto Condensed "ipsum" theme the default for every plot, with all
# panel grid lines removed and the y-axis title rotated to read horizontally.
theme_set(
  theme_ipsum_rc() +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      axis.title.y = element_text(angle = 0)
    )
)

# Build the character vector of ranking dates to scrape.
#
# The input CSV is read without a header row; its first column is parsed as
# day-month-year dates. Only dates from 1970-01-01 (inclusive) up to
# 1978-02-05 (exclusive) are kept. The result is formatted as "YYYY.MM.DD",
# which is the form pasted into the `rankDate` URL parameter further down.
dates <- read_csv("C:/Users/nyulo/Documents/HBS/LM/atp_dates.csv", col_names = FALSE) %>%
  select(date = 1) %>%
  mutate(date = dmy(date)) %>%
  # Compare Date against Date explicitly instead of relying on implicit
  # coercion of the string bounds.
  filter(
    date >= as.Date("1970-01-01"),
    date < as.Date("1978-02-05")
  ) %>%
  pull(date) %>%
  # Format directly rather than converting to "YYYY-MM-DD" and regex-replacing
  # the dashes afterwards.
  format("%Y.%m.%d")


# Scrape the ATP singles ranking page for every entry in `dates` and write one
# CSV file per ranking date.
#
# For each date the loop:
#   1. builds the ranking URL and downloads the page once,
#   2. reads the first HTML table on the page and tags it with the date,
#   3. collects the profile ("overview") link of every listed player,
#   4. scrapes the first table on each profile page into a one-row tibble,
#   5. column-binds rankings, links and player info and writes them to disk.
#
# `counter` only drives the progress message. `df_main` is kept because the
# commented-out accumulation below refers to it.
counter <- 0
df_main <- tibble()

for (date in dates) {
  print(as.character(date))
  print(paste0((100 * (counter / length(dates))), "% Finished"))
  link <- paste0(
    "https://www.atptour.com/en/rankings/singles?rankDate=",
    date,
    "&rankRange=0-100"
  )

  print("MADE LINK")

  # Download and parse the ranking page a single time; the ranking table and
  # the player links are both extracted from this one document, so they are
  # guaranteed to describe the same snapshot and the site is hit half as often.
  rankings_page <- read_html(link)

  # SCRAPE MAIN TABLE
  table_main <- rankings_page %>%
    html_nodes("table") %>%
    .[[1]] %>%
    html_table() %>%
    # `!!date` injects the loop value, so a scraped column that happened to be
    # named `date` could not shadow it inside mutate().
    mutate(date = !!date)

  print("MADE MAIN TABLE")

  # GET LINKS TO EACH PLAYER
  # Keep only hrefs that contain both "player" and "overview".
  player_links <- rankings_page %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    str_subset("player") %>%
    str_subset("overview")

  # cbind() requires one link per ranking row; it errors if the counts differ.
  table_main <- table_main %>%
    cbind(player_links)

  print("GOT PLAYER LINKS")

  # SCRAPE PLAYER LINKS FOR INFO
  player_urls <- paste0("https://www.atptour.com", player_links)

  # Collect one single-row tibble per player and bind them once after the
  # loop, instead of growing a data frame with rbind() on every iteration.
  player_rows <- vector("list", length(player_urls))

  for (i in seq_along(player_urls)) {
    player_rows[[i]] <- player_urls[[i]] %>%
      read_html() %>%
      html_nodes("table") %>%
      .[[1]] %>%
      html_table(fill = TRUE) %>%
      # Flatten the table to a single vector, then turn it into one wide row.
      unlist() %>%
      as_tibble() %>%
      rownames_to_column() %>%
      # NOTE(review): spread() orders the new columns by key, and the keys
      # here are row numbers stored as strings. The positional select() below
      # relies on that ordering -- confirm before migrating to pivot_wider(),
      # which keeps order of appearance instead.
      spread(key = rowname, value = value) %>%
      select(
        age = 1, birthplace = 2, turned_pro = 3, residence = 4,
        weight = 5, strokes = 6, height = 7, coach = 8
      )
  }

  df <- bind_rows(player_rows)

  table_main <- table_main %>% cbind(df)
  print("CBINDED INFO")
  #df_main <- rbind(df_main, table_main)
  #print("RBINDED INFO")
  counter <- counter + 1

  # Dots become underscores for the output file name only.
  date <- date %>% str_replace_all("\\.", "_")
  write_csv(table_main, paste0("C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/tennis_", date, ".csv"))

  # Random pause of |N(0.2, 1)| seconds before moving on to the next date.
  Sys.sleep(abs(rnorm(1, .2)))
}

# Read every file found in the raw-data folder and stack them row-wise into a
# single table, `df_new`. `component_files` is printed so the list of inputs
# is visible when the script is run interactively.
#
# NOTE(review): the scraping loop writes its CSVs to
# ".../tennis_rankings/tennis_<date>.csv", not to this "data/1_raw/" folder --
# presumably the files were moved by hand in between; confirm.
raw_dir <- "C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/data/1_raw/"
component_files <- list.files(raw_dir)
component_files

# Read all files first, then bind once. rbind() inside a loop copies the whole
# accumulated table on every iteration; a single rbind() over the list keeps
# the same coercion rules without the repeated copying.
raw_tables <- lapply(
  component_files,
  function(file) read_csv(paste0(raw_dir, file))
)

# With no input files the result stays an empty tibble, as before.
df_new <- tibble()
if (length(raw_tables) > 0) {
  df_new <- do.call(rbind, raw_tables)
}

# Derive the analysis columns from the stacked raw rankings and store the
# result on disk.
#
# Columns added or rewritten:
#   one_hander  - TRUE when `strokes` mentions "One-Handed Backhand", NA when
#                 it mentions "Unknown Backhand", FALSE otherwise.
#   date, year  - `date` parsed as year-month-day, plus its calendar year.
#   ranking     - the number parsed out of the scraped `Ranking` column.
#   rank_cohort - "Rank 1 - 50" or "Rank 51 - 100"; NA for anything else.
clean <- df_new %>%
  mutate(
    one_hander = ifelse(
      str_detect(strokes, "Unknown Backhand"),
      NA,
      str_detect(strokes, "One-Handed Backhand")
    ),
    date = ymd(date),
    year = year(date),
    ranking = parse_number(Ranking),
    # case_when() evaluates its clauses in order, so the second clause only
    # sees rows that were not already matched as <= 50.
    rank_cohort = case_when(
      ranking <= 50 ~ "Rank 1 - 50",
      ranking <= 100 ~ "Rank 51 - 100"
    )
  )

# NOTE(review): save() writes R's binary workspace format even though the path
# ends in ".R"; the file has to be read back with load(), not source().
save(clean, file = "C:/Users/nyulo/Documents/HBS/LM/tennis_rankings/data/atp_top_100.R")

Back to posts


comments powered by Disqus