# loading packages (pacman installs any that are missing before loading)
pacman::p_load(
  rvest,     # functions for webscraping
  ggthemes,  # extend themes for ggplot2
  lubridate, # handle dates
  here,      # manage file paths
  flextable, # quick tables
  tidyverse  # data wrangling and visualization
)
# set a global theme for all ggplot2 plots in this document
theme_set(
  theme_hc(base_size = 18) + # base font size
    theme(
      legend.position = "none",               # no legend, unless specified in the plot
      legend.title = element_blank(),         # no title on the legend
      axis.title.y = element_text(angle = 90) # rotate y title 90 degrees
    )
)
Efficient, flexible and powerful!
Introduction
Web scraping is a concept that you have most probably heard about. It is the art of harvesting publicly available data from a website for use in your analysis or reporting. Web scraping can be as simple as copying the contents of a website and pasting them into an Excel sheet, but that's not what we are going to do today. Most website pages are built using HTML, and this allows us to use tools such as R to dynamically extract the data.
In this tutorial, I am going to walk you through how you can harvest data from websites using the R programming language. You can do this by coding the logic and instructions manually or use the package {rvest}
to easily extract the contents of a website. We will demonstrate the examples using the two websites below.
- TheGlobalEconomy.com, Exchange rate to USD by country
- CRAN package website and package download metrics website
First, letβs load the packages/tools of trade for the exercise.
using rvest
This is the simplest and easiest approach of them all. rvest helps you scrape (or harvest) data from web pages and it's designed to work with the pipe chain style of coding.
Three simple steps using this package:-
- Use
read_html
function to convert the url into an xml document
- pass the
xml document
to the functionhtml_table
to convert it into a list containing the table(s) from the url source site. - Now that you have tabular data, you can proceed with various data wrangling approaches to clean it up for use.
Dollar exchange data
I came across this website Exchange rate to USD by country while I was working on aggregating output from an economic micro-simulation model estimating benefits and budget impact of setting up fracture liaison services. Part of my reporting covered costs associated with the FLS, reported in local currency for about 10 different countries; for harmonized reporting and international consumption, I needed to convert the local currencies to US Dollars, hence I needed a source that I could cite and that is updated regularly.
# get url from website
exchange_rate_url <- "https://www.theglobaleconomy.com/rankings/Dollar_exchange_rate/"

# read HTML from website
exchange_rate_webpage <- read_html(exchange_rate_url) # read html site

# create a dataframe containing the exchange rates for use
currency_rates <- exchange_rate_webpage %>%
  html_table() %>%    # output a list containing the exchange rate table from the website
  as.data.frame() %>% # transform into a data frame
  # rename columns appropriately
  rename(
    Country = Countries,
    DateLatestData = "Reference",
    LatesValue = "Latest.value",
    Change3months = "Change.three.months",
    Change12months = "Change.twelve.months"
  ) %>%
  mutate(
    across(LatesValue, as.numeric), # convert the column containing the exchange rate to numeric
    # on the date column: remove white space and replace "/" with "-"
    DateLatestData = str_squish(
      str_replace_all(
        string = DateLatestData,
        pattern = "/",
        replacement = "-"
      )
    )
  ) %>%
  # transform `DateLatestData` into a proper date column
  mutate(
    DateLatestData = my(DateLatestData),             # parse "Aug-2024" style month-year
    DateLatestData = format(DateLatestData, "%Y-%m") # display as YYYY-mm
  )
The first 15 rows of the harvested data are shown below
# show the first 15 rows of the harvested data
currency_rates %>%
  head(15)
Country LatesValue DateLatestData Change3months Change12months
1 Afghanistan 70.7133 2024-08 -0.61% -17.45%
2 Albania 90.7575 2024-08 -2.10% -6.33%
3 Algeria 134.2240 2024-08 -0.18% -1.35%
4 Andorra 0.9073 2024-08 -1.68% -1.03%
5 Angola 891.8101 2024-08 5.40% 7.95%
6 Antigua and Barbuda 2.7026 2024-08 0.00% 0.00%
7 Argentina 940.2254 2024-08 5.86% 195.41%
8 Armenia 386.9613 2024-08 0.03% 0.33%
9 Aruba 1.8006 2024-08 -0.04% -0.03%
10 Australia 1.5030 2024-08 -0.15% -2.62%
11 Austria 0.9073 2024-08 -1.68% -1.03%
12 Azerbaijan 1.7000 2024-08 0.00% 0.00%
13 Bahamas 0.9986 2024-08 -0.07% -0.09%
14 Bahrain 0.3766 2024-08 0.00% -0.03%
15 Bangladesh 118.1472 2024-08 0.91% 8.15%
You can now export the harvested data for later use downstream or in your modelling framework like I did. You see, easy and straight forward, right?
# Export the data for later use downstream
rio::export(currency_rates, here("datasets", "dollar_exchange_rates.rds"))
Available CRAN packages
In this section, we are going to harvest data from two websites:-
- List of available CRAN packages by date of publication
- Tracked website of CRAN packages by number of downloads
- Additionally, to get some metrics to combine with data from the above 2 sites, I use
available.packages
function for all cran packages in this mirror
CRAN packages by publication date
I will use the 3 simple steps already covered earlier, employing rvest
package.
# Specifying the url
url <- "https://cran.r-project.org/web/packages/available_packages_by_date.html"

# create a dataframe using `rvest` functions
r_pkgs_by_date <- read_html(url) %>%
  html_table() %>%
  as.data.frame() %>%
  mutate(
    Date = ymd(Date) # parse the publication date
  ) %>%
  rename(pkg_name = Package, date_published = Date)
# view top n rows
r_pkgs_by_date %>%
  head(10)
date_published pkg_name
1 2024-09-08 datefixR
2 2024-09-08 EMCluster
3 2024-09-08 ibdsim2
4 2024-09-08 IsoriX
5 2024-09-08 L1pack
6 2024-09-08 matrixStats
7 2024-09-08 msm
8 2024-09-08 openxlsx2
9 2024-09-08 pedtools
10 2024-09-08 PMCMRplus
Title
1 Standardize Dates in Different Formats or with Missing Data
2 EM Algorithm for Model-Based Clustering of Finite Mixture\nGaussian Distribution
3 Simulation of Chromosomal Regions Shared by Family Members
4 Isoscape Computation and Inference of Spatial Origins using\nMixed Models
5 Routines for L1 Estimation
6 Functions that Apply to Rows and Columns of Matrices (and to\nVectors)
7 Multi-State Markov and Hidden Markov Models in Continuous Time
8 Read, Write and Edit 'xlsx' Files
9 Creating and Working with Pedigrees and Marker Data
10 Calculate Pairwise Multiple Comparisons of Mean Rank Sums\nExtended
CRAN packages metadata
If I need additional details, I can use the function available.packages()
; the downside is that it doesn't contain the date of publication, but some other metrics which are equally important are returned by this function. So, I am going to do that below.
# using available.packages to pull package metadata from the CRAN mirror
available_pks <- available.packages(
  # repos = "http://cran.us.r-project.org", # alternative mirror
  # specify CRAN mirror and metadata columns to extract
  repos = "https://cran.r-project.org/"
)[, c("Version", "Depends", "Repository", "NeedsCompilation", "License")] %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "pkg_name") # package names are stored as rownames
# view top n rows
available_pks %>%
  head(10)
pkg_name Version Depends
1 A3 1.0.0 R (>= 2.15.0), xtable, pbapply
2 AalenJohansen 1.0 <NA>
3 AATtools 0.0.3 R (>= 3.6.0)
4 ABACUS 1.0.0 R (>= 3.1.0)
5 abasequence 0.1.0 <NA>
6 abbreviate 0.1 <NA>
7 abc 2.2.1 R (>= 2.10), abc.data, nnet, quantreg, MASS, locfit
8 abc.data 1.1 R (>= 2.10)
9 ABC.RAP 0.9.0 R (>= 3.1.0)
10 ABCanalysis 1.2.1 R (>= 2.10)
Repository NeedsCompilation License
1 https://cran.r-project.org/src/contrib no GPL (>= 2)
2 https://cran.r-project.org/src/contrib no GPL (>= 2)
3 https://cran.r-project.org/src/contrib no GPL-3
4 https://cran.r-project.org/src/contrib no GPL-3
5 https://cran.r-project.org/src/contrib no GPL-3
6 https://cran.r-project.org/src/contrib no GPL-3
7 https://cran.r-project.org/src/contrib no GPL (>= 3)
8 https://cran.r-project.org/src/contrib no GPL (>= 3)
9 https://cran.r-project.org/src/contrib no GPL-3
10 https://cran.r-project.org/src/contrib no GPL-3
CRAN packages downloads
We can grab metric on number of downloads for each package in CRAN from this website.
# url of the tracked download metrics website
pkg_down_url <- "https://www.datasciencemeta.com/rpackages"

pkg_downloads <- read_html(pkg_down_url) %>%
  html_table() %>%
  as.data.frame() %>%
  select(-c(Author, Maintainer)) %>% # remove columns with no data
  mutate(
    Downloads = str_remove_all(Downloads, pattern = ","), # get rid of commas
    Downloads = as.integer(Downloads)                     # convert into an integer
  ) %>%
  rename(pkg_name = "Package.Name", downloads = Downloads)
# view top n rows
pkg_downloads %>%
  head(10)
Rank pkg_name downloads
1 1 ggplot2 147979440
2 2 rlang 136425392
3 3 magrittr 126139801
4 4 dplyr 111533228
5 5 vctrs 99097308
6 6 cli 96527305
7 7 tibble 93773928
8 8 devtools 91913477
9 9 jsonlite 91716968
10 10 Rcpp 88506647
Here is the fun part: we can now join all three datasets, each telling a different aspect of the packages, into one dataframe we can use for visualization and exploration — how cool is that!
# join the three datasets into one dataframe, keyed on package name
CRAN_pkgs <- reduce(
  list(r_pkgs_by_date, pkg_downloads, available_pks), # reduce from purrr package
  left_join,
  by = "pkg_name" # join by package name
) %>%
  arrange(Rank) %>%
  select(Rank, pkg_name, downloads, everything())
# view top n rows
CRAN_pkgs %>%
  head(5) %>%
  reactable::reactable()
For instance, as of the harvesting of this data, what are the top 20 most downloaded packages from CRAN? We can summarize and visualize that information.
# top 20 most downloaded packages from CRAN
CRAN_pkgs %>%
  filter(Rank <= 20) %>%
  reactable::reactable()
More fun exploration and data wrangling can be performed on the harvested data, feel free to play with the data further.
Export the combined dataset
# export data for playing around with
rio::export(CRAN_pkgs, here("datasets", "CRAN_pkgs.rds"))
Manually using readLines()
Sometimes, not all websites are structured in the manner that the dollar exchange website is. What this means is that the magic of the two functions above (read_html
and html_table
) do not work as expected.
In such scenarios, I am going to show you a manual way to still harvest data using the readLines
functions. I must admit that this is a very painful approach and is NOT standard nor reproducible as it depends on the state/structure of the website at the time of harvesting.
Read the webpage
The first step is to read the html code from the website using readLines
function.
# store url link
dollar_ex_url <- "https://www.theglobaleconomy.com/rankings/Dollar_exchange_rate/"

# Reading the HTML code from the website, one line per element
dollaEx_webpage <- readLines(dollar_ex_url)
Warning in readLines(dollar_ex_url): incomplete final line found on
'https://www.theglobaleconomy.com/rankings/Dollar_exchange_rate/'
# inspect the first 15 lines of the raw HTML
head(dollaEx_webpage, 15)
[1] "<!DOCTYPE html>"
[2] "<html xmlns='https://www.w3.org/1999/xhtml' lang=\"en\" xml:lang='en' >"
[3] "\t<head>"
[4] "\t\t<meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1, user-scalable=0\"/>"
[5] "\t\t"
[6] "\t\t<link rel=\"canonical\" href=\"https://www.theglobaleconomy.com/rankings/Dollar_exchange_rate/\">"
[7] "\t\t"
[8] "\t\t\t\t<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
[9] "\t\t"
[10] "\t\t<link rel=\"shortcut icon\" href=\"/favicon.ico\" type=\"image/x-icon\">"
[11] "\t\t<link rel=\"icon\" href=\"/favicon.ico\" type=\"image/x-icon\">"
[12] "\t\t"
[13] "\t\t<title>USD exchange rate around the world | TheGlobalEconomy.com</title>"
[14] "\t\t<meta name=\"Keywords\" content=\"USD exchange rate around the world\" />"
[15] "\t\t<meta name=\"Description\" content=\"What factors determine the exchange rates The exchange rate is the price of one currency expressed in units of another currency. We say that a currency is appreciating/depreciating relative to another currency if it takes more/less units of the other currency to purchase it. The exchange rate appreciates when the demand for the currency increases, i.e. when more people want to buy it and depreciates when the supply increases, i.e. when more people want to sell it. Look at it from the U.S. perspective. Foreigners want to buy dollars when they: 1.\tInvest in the U.S. 2.\tBuy goods and services from the U.S. At the same time U.S. citizens sell dollars when they: 1.\tInvest overseas. 2.\tBuy foreign goods and services. Holding all else constant, if foreign investment in the U.S. increases, this will create more demand for dollars, and the dollar will appreciate. Similarly, when the U.S. increases its international purchases and investments, that creates additional supply of dollars and the dollar will depreciate. Several examples U.S. interest rates increase. That leads to greater foreign investment into the U.S., greater demand for dollars, and dollar appreciation. New tariffs on U.S. exports to Europe. Now foreigners buy fewer U.S. goods, the demand for dollars declines, and the dollar depreciates. The U.S. economy is growing rapidly. On one hand, that attracts foreign investment and causes dollar appreciation. On the other hand, U.S. imports increase since Americans can buy more international goods and services and cause dollar depreciation. We are not sure which effect would be stronger but usually a growing economy is associated with an appreciating currency. Expected appreciation of the dollar. If for some reason investors believe that the dollar will appreciate in one year, they will want to buy it now. This creates additional demand for dollars and leads to appreciation now. 
If you wonder what the effect of some macroeconomic variable on the exchange rate is, ask yourself how, holding everything else constant, that variable affects the demand or the supply of the currency. Based on that, you can determine if the change is likely to lead to appreciation or depreciation. Forecasting currency values: short, medium, and long run Of course, in reality there are multiple macroeconomic changes occurring at the same time. Interest rates may be changing in various countries, GDP growth rates differ, trade flows shift, etc. The combination of all these factors makes it virtually impossible to predict the exchange rate at short horizons (days, weeks). Too many fundamentals are moving in different directions at the same time. Therefore, economists say that the exchange rate follows a βrandom walkβ, i.e. the best forecast of tomorrowβs exchange rate is todayβs exchange rate. The likelihood of appreciation and depreciation are the same. At medium-term time horizons (months, a few years) the effect of particular macroeconomic fundamentals on the exchange rate becomes more visible. For example, if the economy is growing rapidly its currency is likely to appreciate as the country attracts international capital that creates demand for the local money. In contrast, in the long-run (many years, decades) macroeconomic fundamentals cease to have a role. Over such long periods of time, the economic growth rates, interest rates, as well as international trade and investment flows have stabilized at some βaverageβ levels. Then, the only determinant of the exchange rate is the rate of money supply growth. Countries that print money more rapidly than the rest of the world will experience currency depreciation. In fact, the size of the depreciation would be equal to the increase of the money supply. Recall that the exchange rate is the price of a currency. If you create too much of it the price will come down. 
Additional resources Information about the European Monetary Union with a lot of historic information, analyses and data. Available from: European Central Bank The International Monetary Fund, a wealth of information about all member countries and about international financial markets. Available from: IMF.\" />"
The dollaEx_webpage
is basically a HTML page with all the usual html tags and stuff. So, we are going to refer to the website for a starting point that gives us a reference for the first entry and last entry of the page. From experience, I have found that focusing on the actual data saves time and makes it very easy to proceed, as opposed to randomly picking any entry on the page.
For instance, Afghanistan is the first entry on the table and Zambia is the last entry, this gives me an idea of the bound of the data that I want. Ideally I need everything between entry one and last entry and I am going to show you how.
Search for data bounds
Now, the manual work starts here. From the source website, I want to find the index or indices of Afghanistan
, my first data point.
# find the line index of the first data point (Afghanistan)
str_which(dollaEx_webpage, "Afghanistan")
[1] 949
What this means is that the key word Afghanistan has been found at the reported position (sometimes multiple positions get returned). We can try to triangulate around these positions and see what we've got going. There is no rule for triangulation; you just throw the net and inspect the output, that's why it's a manual approach.
From the dollaEx_webpage
object and with the index of my first data point, I start the triangulation process. For instance, let me view the next 10 lines after position 922
dollaEx_webpage[922:932] # triangulate: first 10 lines after the candidate position
[1] "\t\t\tSelect indicator\t\t</a>"
[2] "\t\t<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[3] "\t</div>"
[4] ""
[5] "\t<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
[6] "\t"
[7] "\t<div style=\"width: 98%; max-width: 750px; text-align: justify; float:left; vertical-align:top; margin: 10px 0 0 5px\">"
[8] "\t\t<div style=\"width: 100%; overflow: auto;\">"
[9] "\t\t\tThe amount of local currency units that can be exchanged for one USD. An increase (decrease) means USD appreciation (depreciation). The values are monthly averages of the daily exchange rates provided by Google Finance. "
[10] "\t\t\t</br></br>"
[11] "\t\t\t<div style=\"width: auto; max-width: 300px; float: left; margin: 0 0 0 0px\">"
Clearly from the above output, if I were interested in data for Afghanistan
only, i.e. the 5 columns from the table in the website, they are contained between positions 922 and 926, see below.
dollaEx_webpage[922:926] # the 5 lines bounding the first country's row
[1] "\t\t\tSelect indicator\t\t</a>"
[2] "\t\t<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[3] "\t</div>"
[4] ""
[5] "\t<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
NOTE, I am ONLY interested at the start and end of the data string containing the columns a.ka. data that I need. Do the same for the last entry on the table (in the website), which in this case is Zambia.
# Last entry on the table (Zambia) — gives the end bound of the data
str_which(dollaEx_webpage, "Zambia")
[1] 2279
NOTE, I am interested at the start and end of the data string. The index for Zambia is 2252, based on what I did with the first entry on the table, it simply means that the data bound for Zambia is contained within 4 steps/lines from 2252. See below.
dollaEx_webpage[2252:2256] # the 5 lines bounding the last country's row
[1] "\t\t\t\t\t\t<td>12631.2350</td>"
[2] "\t\t\t\t\t\t<td>Aug / 2024</td>"
[3] "\t\t\t\t\t\t\t\t\t\t\t\t<td>-0.51%</td>"
[4] "\t\t\t\t\t\t<td>5.74%</td>"
[5] "\t\t\t\t\t\t\t\t\t\t\t</tr>"
After carefully inspecting the above outputs, I can conclude that the data I am looking for (the whole table) lies between positions 922 and 2256. Anything before or after contains nothing other than HTML tags.
Define whole data bounds
After arriving at this conclusion, then next step is to now narrow your webpage to the data bounds only i.e.
# resize your webpage to only contain the data bounds found above
dollaEx_webData <- dollaEx_webpage[922:2256]
# inspect the trimmed data
head(dollaEx_webData, 15)
[1] "\t\t\tSelect indicator\t\t</a>"
[2] "\t\t<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[3] "\t</div>"
[4] ""
[5] "\t<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
[6] "\t"
[7] "\t<div style=\"width: 98%; max-width: 750px; text-align: justify; float:left; vertical-align:top; margin: 10px 0 0 5px\">"
[8] "\t\t<div style=\"width: 100%; overflow: auto;\">"
[9] "\t\t\tThe amount of local currency units that can be exchanged for one USD. An increase (decrease) means USD appreciation (depreciation). The values are monthly averages of the daily exchange rates provided by Google Finance. "
[10] "\t\t\t</br></br>"
[11] "\t\t\t<div style=\"width: auto; max-width: 300px; float: left; margin: 0 0 0 0px\">"
[12] "\t\t\t\t<a href=\"https://www.theglobaleconomy.com/download-data.php\" target=\"_blank\" style=\"font-weight: bold;\">Download data from our database</a>"
[13] "<!-- \t\t\t<a href=\"https://www.theglobaleconomy.com/data_feed_api.php\" target=\"_blank\" style=\"color: #e2bb04; display: inline-block; margin: 0 0 0 20px;\">API</a>-->"
[14] "\t\t\t</div>"
[15] "\t\t\t</br></br>"
If you widely look at the object dollaEx_webData
, it contains a lot of unnecessary HTML tags, so now it's time to clean this object carefully in order to obtain the data.
Cleanup process
First get rid of all the "\t\t\t\t\t\t\t\t\t\t\t\t\t"
# remove ALL the "\t" (tab) characters from every line
dollaEx_clean <- dollaEx_webData %>% str_remove_all(pattern = "\t")
# inspect the de-tabbed lines
head(dollaEx_clean,25)
[1] "Select indicator</a>"
[2] "<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[3] "</div>"
[4] ""
[5] "<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
[6] ""
[7] "<div style=\"width: 98%; max-width: 750px; text-align: justify; float:left; vertical-align:top; margin: 10px 0 0 5px\">"
[8] "<div style=\"width: 100%; overflow: auto;\">"
[9] "The amount of local currency units that can be exchanged for one USD. An increase (decrease) means USD appreciation (depreciation). The values are monthly averages of the daily exchange rates provided by Google Finance. "
[10] "</br></br>"
[11] "<div style=\"width: auto; max-width: 300px; float: left; margin: 0 0 0 0px\">"
[12] "<a href=\"https://www.theglobaleconomy.com/download-data.php\" target=\"_blank\" style=\"font-weight: bold;\">Download data from our database</a>"
[13] "<!-- <a href=\"https://www.theglobaleconomy.com/data_feed_api.php\" target=\"_blank\" style=\"color: #e2bb04; display: inline-block; margin: 0 0 0 20px;\">API</a>-->"
[14] "</div>"
[15] "</br></br>"
[16] "<table id=\"benchmarkTable\" class=\"sortable\" style=\"width: 100%; max-width: 750px;\">"
[17] "<thead>"
[18] "<tr style=\"height: 40px;\">"
[19] "<th>Countries</th>"
[20] "<th>Latest value</th>"
[21] "<th>Reference</th>"
[22] "<th>Change three months</th>"
[23] "<th>Change twelve months</th>"
[24] "</tr>"
[25] "</thead>"
The object dollaEx_webData
basically contains unique strings that contain data from the column of the table in the url.
If you look closely, you will see a recurring pattern of strings, each starting with "<td><a href='/"
and ending with a double "</tr>"
; and ideally, this is the trick. The ability to understand the output and content of dollaEx_clean.
If you then count these start and end of recurring patterns, you find that they are 7 in number, so a single country's data is composed of 7 unique strings. Try viewing the first 25 rows of dollaEx_clean
, you will be able to see what I mean.
Take these unique strings and turn them into a matrix with each column containing the vital pieces of information and drop what is NOT required.
# reshape the cleaned strings into a matrix: one country per row
dollaEx_matrix <- matrix(dollaEx_clean,
  ncol = 7, # 7 columns because there were 7 unique strings per country
  byrow = TRUE
)
Warning in matrix(dollaEx_clean, ncol = 7, byrow = TRUE): data length [1335] is
not a sub-multiple or multiple of the number of rows [191]
# inspect the first rows of the matrix
head(dollaEx_matrix, 4)
[,1]
[1,] "Select indicator</a>"
[2,] "<div style=\"width: 100%; overflow: auto;\">"
[3,] "</br></br>"
[4,] "<th>Change three months</th>"
[,2]
[1,] "<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[2,] "The amount of local currency units that can be exchanged for one USD. An increase (decrease) means USD appreciation (depreciation). The values are monthly averages of the daily exchange rates provided by Google Finance. "
[3,] "<table id=\"benchmarkTable\" class=\"sortable\" style=\"width: 100%; max-width: 750px;\">"
[4,] "<th>Change twelve months</th>"
[,3]
[1,] "</div>"
[2,] "</br></br>"
[3,] "<thead>"
[4,] "</tr>"
[,4]
[1,] ""
[2,] "<div style=\"width: auto; max-width: 300px; float: left; margin: 0 0 0 0px\">"
[3,] "<tr style=\"height: 40px;\">"
[4,] "</thead>"
[,5]
[1,] "<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
[2,] "<a href=\"https://www.theglobaleconomy.com/download-data.php\" target=\"_blank\" style=\"font-weight: bold;\">Download data from our database</a>"
[3,] "<th>Countries</th>"
[4,] "<tbody>"
[,6]
[1,] ""
[2,] "<!-- <a href=\"https://www.theglobaleconomy.com/data_feed_api.php\" target=\"_blank\" style=\"color: #e2bb04; display: inline-block; margin: 0 0 0 20px;\">API</a>-->"
[3,] "<th>Latest value</th>"
[4,] "<tr>"
[,7]
[1,] "<div style=\"width: 98%; max-width: 750px; text-align: justify; float:left; vertical-align:top; margin: 10px 0 0 5px\">"
[2,] "</div>"
[3,] "<th>Reference</th>"
[4,] "<td><a href='/Afghanistan/Dollar_exchange_rate/' style=\"color:#000;\">Afghanistan</a></td>"
The matrix contains 7 columns (7 unique strings, remember?). The information that we are most interested in (i.e. the columns of the table in the url) is contained in the following columns of the matrix.
- column 1 contains country
- column 3 contains the date of the latest data
- column 2 contains latest value
- column 4 contains change 3 months
- column 5 contains change 12 months
With this information, we can now construct the final clean matrix by dropping columns 6 and 7, which contain nothing we are interested in.
# drop columns 6 and 7, which don't contain information of interest
dollaEx_matrix_clean <- dollaEx_matrix[, c(1, 2, 3, 4, 5)]
# inspect the reduced matrix
head(dollaEx_matrix_clean, 5)
[,1]
[1,] "Select indicator</a>"
[2,] "<div style=\"width: 100%; overflow: auto;\">"
[3,] "</br></br>"
[4,] "<th>Change three months</th>"
[5,] "<td>70.7133</td>"
[,2]
[1,] "<div style=\"font-size: 0.8em;text-align: center;\">* indicates monthly or quarterly data series</div>"
[2,] "The amount of local currency units that can be exchanged for one USD. An increase (decrease) means USD appreciation (depreciation). The values are monthly averages of the daily exchange rates provided by Google Finance. "
[3,] "<table id=\"benchmarkTable\" class=\"sortable\" style=\"width: 100%; max-width: 750px;\">"
[4,] "<th>Change twelve months</th>"
[5,] "<td>Aug / 2024</td>"
[,3]
[1,] "</div>"
[2,] "</br></br>"
[3,] "<thead>"
[4,] "</tr>"
[5,] "<td>-0.61%</td>"
[,4]
[1,] ""
[2,] "<div style=\"width: auto; max-width: 300px; float: left; margin: 0 0 0 0px\">"
[3,] "<tr style=\"height: 40px;\">"
[4,] "</thead>"
[5,] "<td>-17.45%</td>"
[,5]
[1,] "<div style=\"float: left; width: 35px; height: 20px;\" class=\"more850px\"></div>"
[2,] "<a href=\"https://www.theglobaleconomy.com/download-data.php\" target=\"_blank\" style=\"font-weight: bold;\">Download data from our database</a>"
[3,] "<th>Countries</th>"
[4,] "<tbody>"
[5,] "</tr>"
Now construct a tibble from the above matrix and this will be our final product of the table from the url
# final table: assemble a tibble from the matrix columns
# (note: the date lives in matrix column 3 and the value in column 2)
exchange_data <- tibble(
  Country        = dollaEx_matrix_clean[, 1],
  LatestData     = dollaEx_matrix_clean[, 3],
  LatesValue     = dollaEx_matrix_clean[, 2],
  Change3months  = dollaEx_matrix_clean[, 4],
  Change12months = dollaEx_matrix_clean[, 5]
)
# inspect the assembled tibble
head(exchange_data)
# A tibble: 6 Γ 5
Country LatestData LatesValue Change3months Change12months
<chr> <chr> <chr> <chr> <chr>
1 "Select indicator</a>" </div> "<div styβ¦ "" "<div style=\β¦
2 "<div style=\"width: 100%;β¦ </br></br> "The amouβ¦ "<div style=β¦ "<a href=\"htβ¦
3 "</br></br>" <thead> "<table iβ¦ "<tr style=\β¦ "<th>Countrieβ¦
4 "<th>Change three months</⦠</tr> "<th>Chan⦠"</thead>" "<tbody>"
5 "<td>70.7133</td>" <td>-0.61β¦ "<td>Aug β¦ "<td>-17.45%β¦ "</tr>"
6 "<td>90.7575</td>" <td>-2.10β¦ "<td>Aug β¦ "<td>-6.33%<β¦ "</tr>"
Further clean-up is needed for the columns (they don't contain usable data yet); there are many html tags that need to be removed.
# strip the anchor-tag markup around the country names
exchange_data <- exchange_data %>%
  mutate(
    Country = str_remove(Country, pattern = "/Dollar_exchange_rate/.*"), # remove all characters after "/Dollar_exchange_rate/"
    Country = str_remove(Country, pattern = "<td><a href='/")            # remove the start tag for each value of country
  )
# inspect after cleaning the Country column
head(exchange_data, 5)
# A tibble: 5 Γ 5
Country LatestData LatesValue Change3months Change12months
<chr> <chr> <chr> <chr> <chr>
1 "Select indicator</a>" </div> "<div styβ¦ "" "<div style=\β¦
2 "<div style=\"width: 100%;β¦ </br></br> "The amouβ¦ "<div style=β¦ "<a href=\"htβ¦
3 "</br></br>" <thead> "<table iβ¦ "<tr style=\β¦ "<th>Countrieβ¦
4 "<th>Change three months</⦠</tr> "<th>Chan⦠"</thead>" "<tbody>"
5 "<td>70.7133</td>" <td>-0.61β¦ "<td>Aug β¦ "<td>-17.45%β¦ "</tr>"
The rest contain an almost similar pattern, so we are going to clean them all in one go. Basically, remove the patterns/tags <td>
and </td>
from our columns.
# strip the remaining cell tags and tidy the value/date columns
exchange_data <- exchange_data %>%
  mutate(across(
    .cols = c(LatestData:Change12months),
    .fns = ~ str_remove_all(., pattern = "<td>|</td>") # remove the "<td>" / "</td>" tags
  )) %>%
  mutate(
    across(LatesValue, as.numeric), # convert the exchange rate column to numeric
    # on the date column: remove white space and replace "/" with "-"
    LatestData = str_squish(
      str_replace_all(
        string = LatestData,
        pattern = "/",
        replacement = "-"
      )
    )
  ) %>%
  # transform `LatestData` into a proper date column
  mutate(
    LatestData = my(LatestData),             # parse "Aug-2024" style month-year
    LatestData = format(LatestData, "%Y-%m") # display as YYYY-mm
  )
Warning: There was 1 warning in `mutate()`.
βΉ In argument: `across(LatesValue, as.numeric)`.
Caused by warning:
! NAs introduced by coercion
Warning: There was 1 warning in `mutate()`.
βΉ In argument: `LatestData = my(LatestData)`.
Caused by warning:
! 191 failed to parse.
# inspect the last rows of the cleaned data
exchange_data %>%
  tail(6)
# A tibble: 6 Γ 5
Country LatestData LatesValue Change3months Change12months
<chr> <chr> <dbl> <chr> <chr>
1 <td>3717.0528</td> <NA> NA 0.74% </tr>
2 <td>41.1613</td> <NA> NA 11.54% </tr>
3 <td>3.6730</td> <NA> NA 0.00% </tr>
4 <td>0.7726</td> <NA> NA -1.88% </tr>
5 <td>40.3301</td> <NA> NA 6.49% </tr>
6 <td>12631.2350</td> <NA> NA 5.74% </tr>
NOTE: The downside of this manual approach is that it always depends on the website not changing structure, for instance if we had other countries added after Zambia, then our code above would not be able to extract that extra data sadly.
Session info
# record the R session details for reproducibility
sessioninfo::session_info()
β Session info βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
setting value
version R version 4.3.2 (2023-10-31)
os macOS Sonoma 14.6.1
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/London
date 2024-09-08
pandoc 3.1.11 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
β Packages βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
package * version date (UTC) lib source
askpass 1.2.0 2023-09-03 [1] CRAN (R 4.3.0)
cli 3.6.3 2024-06-21 [1] CRAN (R 4.3.3)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.3.3)
crayon 1.5.3 2024-06-20 [1] CRAN (R 4.3.3)
crosstalk 1.2.1 2023-11-23 [1] CRAN (R 4.3.1)
crul 1.5.0 2024-07-19 [1] CRAN (R 4.3.3)
curl 5.2.1 2024-03-01 [1] CRAN (R 4.3.1)
data.table 1.15.4 2024-03-30 [1] CRAN (R 4.3.1)
digest 0.6.37 2024-08-19 [1] CRAN (R 4.3.3)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.3.1)
evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.3.3)
fansi 1.0.6 2023-12-08 [1] CRAN (R 4.3.1)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.3.3)
flextable * 0.9.6 2024-05-05 [1] CRAN (R 4.3.1)
fontBitstreamVera 0.1.1 2017-02-01 [1] CRAN (R 4.3.0)
fontLiberation 0.1.0 2016-10-15 [1] CRAN (R 4.3.0)
fontquiver 0.2.1 2017-02-01 [1] CRAN (R 4.3.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
gdtools 0.3.7 2024-03-05 [1] CRAN (R 4.3.1)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
gfonts 0.2.0 2023-01-08 [1] CRAN (R 4.3.0)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.3.1)
ggthemes * 5.1.0 2024-02-10 [1] CRAN (R 4.3.1)
glue 1.7.0 2024-01-09 [1] CRAN (R 4.3.1)
gtable 0.3.5 2024-04-22 [1] CRAN (R 4.3.1)
here * 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.3.1)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.3.1)
httpcode 0.3.0 2020-04-10 [1] CRAN (R 4.3.0)
httpuv 1.6.15 2024-03-26 [1] CRAN (R 4.3.1)
httr 1.4.7 2023-08-15 [1] CRAN (R 4.3.0)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.3.1)
knitr 1.48 2024-07-07 [1] CRAN (R 4.3.3)
later 1.3.2 2023-12-06 [1] CRAN (R 4.3.1)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.3.1)
lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.3.1)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
mime 0.12 2021-09-28 [1] CRAN (R 4.3.0)
munsell 0.5.1 2024-04-01 [1] CRAN (R 4.3.1)
officer 0.6.6 2024-05-05 [1] CRAN (R 4.3.1)
openssl 2.2.1 2024-08-16 [1] CRAN (R 4.3.3)
pacman 0.5.1 2019-03-11 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
promises 1.3.0 2024-04-05 [1] CRAN (R 4.3.1)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.3.0)
R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.3.0)
R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.3.1)
R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.3.1)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
ragg 1.3.2 2024-05-15 [1] CRAN (R 4.3.3)
Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.3.3)
reactable 0.4.4 2023-03-12 [1] CRAN (R 4.3.0)
reactR 0.5.0 2023-10-11 [1] CRAN (R 4.3.1)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.3.1)
rio 1.2.2 2024-08-19 [1] CRAN (R 4.3.3)
rlang 1.1.4 2024-06-04 [1] CRAN (R 4.3.3)
rmarkdown 2.28 2024-08-17 [1] CRAN (R 4.3.3)
rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.3.1)
rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.3.1)
rvest * 1.0.4 2024-02-12 [1] CRAN (R 4.3.1)
scales 1.3.0 2023-11-28 [1] CRAN (R 4.3.1)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
shiny 1.9.1 2024-08-01 [1] CRAN (R 4.3.3)
stringi 1.8.4 2024-05-06 [1] CRAN (R 4.3.1)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.3.1)
systemfonts 1.1.0 2024-05-15 [1] CRAN (R 4.3.3)
textshaping 0.4.0 2024-05-24 [1] CRAN (R 4.3.3)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.3.1)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.3.1)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.3.1)
tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.3.1)
uuid 1.2-1 2024-07-29 [1] CRAN (R 4.3.3)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.3.1)
withr 3.0.1 2024-07-31 [1] CRAN (R 4.3.3)
xfun 0.47 2024-08-17 [1] CRAN (R 4.3.3)
xml2 1.3.6 2023-12-04 [1] CRAN (R 4.3.1)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.3.0)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.3.3)
zip 2.3.1 2024-01-27 [1] CRAN (R 4.3.1)
[1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ