---
title: "Scrape Load Data"
output: html_document
---
Similar to our generation scraping script, this time we're going to scrape data from NYISO's [load data archive](http://mis.nyiso.com/public/P-58Blist.htm) and export it to our csvs directory.
```{r}
library(httr)
library(xml2)
library(stringr)
library(rlist)

# Fetch the archive listing page and parse it into an HTML document
archive_html <- httr::content(httr::GET('http://mis.nyiso.com/public/P-58Clist.htm'))
archive_html
```
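If the request fails, `content()` will happily parse whatever error page comes back, so it can be worth failing fast on a bad HTTP status. A minimal sketch of that check (my addition, not part of the original script):

```{r}
# Optional sanity check: raise an error if the request didn't return 2xx
response <- httr::GET('http://mis.nyiso.com/public/P-58Clist.htm')
httr::stop_for_status(response)
archive_html <- httr::content(response)
```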
Next we'll transform the HTML webpage into a list of archives:
```{r}
transform_anchor <- function(node) {
  # Skip anchors with no link text
  if (nchar(xml_text(node)) == 0) {
    return(NULL)
  }
  # Parse out the useful data
  url <- paste("http://mis.nyiso.com/public/", xml_attr(node, "href"), sep = "")
  is_archive <- grepl(".zip", url, fixed = TRUE)
  # Reverse the dash-separated date components so the year comes first
  date_components <- list.reverse(str_split(xml_text(node), "-")[[1]])
  date <- paste(date_components, collapse = "-")
  year <- strtoi(date_components[1])
  save_path <- paste("./scraped_data/zips/load/", date, ".zip", sep = "")
  list(date = date, year = year, url = url, is_archive = is_archive, save_path = save_path)
}

all_links <- xml_find_all(archive_html, "//td/a")
available_data <- lapply(all_links, transform_anchor)

# Keep only the monthly zip archives; drop NULLs and the realtime CSV links,
# since we don't really need those
archives <- Filter(function(x) { !is.null(x) && x$is_archive }, available_data)

INTERESTING_YEARS <- c(2019, 2020, 2021)
# Keep only the years contained in that list
archives <- Filter(function(x) { x$year %in% INTERESTING_YEARS }, archives)

# Also drop the first (latest) archive, as that's likely an incomplete current month
archives <- archives[-1]

length(archives)
```
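Before downloading anything, it's worth eyeballing what the parser produced. A quick peek (my addition, not part of the original script):

```{r}
# Inspect the first parsed record and list the months we're about to download
str(archives[[1]])
sapply(archives, function(x) x$date)
```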
Let's download each of the zips:
```{r}
# Make sure the target directory exists; write_disk() errors out otherwise
dir.create("./scraped_data/zips/load", recursive = TRUE, showWarnings = FALSE)

for (archive in archives) {
  GET(archive$url, write_disk(archive$save_path, overwrite = TRUE))
  print(paste("Downloaded", archive$save_path))
}
print("Done!")
```
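On a flaky connection a plain `GET` can fail partway through the loop. A hedged alternative (my own sketch, not the original approach) is to retry each download a few times and pause between requests:

```{r}
# Sketch: retry each download up to 3 times, pausing between requests
for (archive in archives) {
  httr::RETRY("GET", archive$url,
              httr::write_disk(archive$save_path, overwrite = TRUE),
              times = 3)
  Sys.sleep(1)  # be polite to the NYISO server
}
```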
... and finally, extract the zip files to csvs:
```{r}
output_to <- "./scraped_data/csvs/load"

unzip_archive <- function(archive) {
  # unzip() creates the output directory if it doesn't already exist
  unzip(archive$save_path, exdir = output_to, overwrite = TRUE)
  print(paste("Unzipped", archive$save_path))
}

invisible(lapply(archives, unzip_archive))
```
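As a final sanity check, we can confirm the csvs actually landed where we expect (an optional verification step, not in the original script):

```{r}
# Count and preview the extracted CSV files
csv_files <- list.files(output_to, pattern = "\\.csv$", full.names = TRUE)
length(csv_files)
head(csv_files)
```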