# load packages
library(tidyverse)
library(dataverse)
library(httr)
# options
# raise the "timeout" option to at least 10 minutes (10*60 seconds); max()
# keeps the current value if it is already larger
options(timeout = max(10*60, getOption("timeout")))
## Note: see https://github.com/IQSS/dataverse-client-r/issues/17 and
## https://github.com/IQSS/dataverse/issues/4373 for a couple of weird
## issues with this particular dataverse repo. I think for some reason
## this breaks the dataverse::get_data() function
# set up
# point the dataverse client at the Harvard Dataverse server
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
# fetch the dataset metadata for this handle
dataset <- get_dataset("hdl:1902.1/FYXLAWZRIA")
# quick look at the file names and content types listed for the dataset
dataset$files[c("filename", "contentType")]
# files to download from dataverse
# (names are matched against the dataset's file labels, so they are kept
# exactly as written -- including the entry without an extension)
filenames <- c(
  "Sectors of Source _ Target.txt",
  "Levels of Source _ Target.txt",
  "Names of Source _ Target.txt",
  "1990-1994 Data (N=2_679_938).tab",
  "1995-1999 Data (N=4_108_102)",
  "2000-2004 Data (N=3464898).tab"
)
# create a data frame with filenames and associated ids
filenames_df <- dataset$files[, c("label", "id")] %>%
  # the dataset calls the file name "label"
  rename(filename = label) %>%
  # keep only the files requested in `filenames`
  filter(filename %in% filenames) %>%
  # build the direct-download URL from the file id
  mutate(
    download_url = paste0(
      "https://dataverse.harvard.edu/api/access/datafile/", id
    )
  ) %>%
  # quick look
  glimpse()

# the filter above silently drops any requested name that has no matching
# label in the dataset, so report those explicitly
missing_files <- setdiff(filenames, filenames_df$filename)
if (length(missing_files) > 0) {
  warning(
    "Requested files not found in the dataset: ",
    paste(missing_files, collapse = "; "),
    call. = FALSE
  )
}
# make the filenames compatible with make
filenames_df$clean_filename <- filenames_df$filename %>%
  # convert to lower case
  str_to_lower() %>%
  # replace spaces with dashes
  str_replace_all(" ", "-") %>%
  # remove any extensions (the raw data have .tab, .txt, and missing extensions)
  str_remove("\\.[:alnum:]*") %>%
  # add a common .txt extension
  str_c(".txt") %>%
  # quick look
  glimpse()
# set directory to save files; create if nec
dir <- "data/king-lowe-2008/"
# recursive = TRUE so the parent "data" directory is created as well when it
# does not exist yet
if (!dir.exists(dir)) dir.create(dir, recursive = TRUE)
# download and write each file
# iterate over the rows of filenames_df (not over `filenames`): the data frame
# only holds the files that were matched in the dataset, and seq_len() is safe
# when there are zero rows
for (i in seq_len(nrow(filenames_df))) {
  # progress message, one line per file
  cat("Working on ", filenames_df$filename[i], "\n", sep = "")
  # doesn't work:
  #   file <- get_file(filenames[i], "hdl:1902.1/FYXLAWZRIA", format = "original")
  # not needed because the above doesn't work:
  #   writeBin(file, paste0(dir, clean_filenames[i]))
  # see here for more: https://github.com/IQSS/dataverse/issues/4373
  # `dir` ends with "/", so paste0() yields the full destination path
  download.file(
    url = filenames_df$download_url[i],
    destfile = paste0(dir, filenames_df$clean_filename[i])
  )
}
Get the Raw Data
We need several data sets for this project.
- The IDEA events data (King and Lowe 2003) from Dataverse. These files are very large and there are several.
- The `states2016.csv` and `system2016.csv` State System Membership data sets from the Correlates of War that we use to define the country-years.
The IDEA Events Data
The State System Membership
# states2016.csv: read the file from the Correlates of War site and save a
# local copy under data/
write_csv(
  read_csv("https://correlatesofwar.org/wp-content/uploads/states2016.csv"),
  "data/states2016.csv"
)
# system2016.csv: same read-then-save step for the second file
write_csv(
  read_csv("https://correlatesofwar.org/wp-content/uploads/system2016.csv"),
  "data/system2016.csv"
)