An Attempt to Scrape Wikipedia

This post covers my attempt to scrape and clean the tables of a Wikipedia article in order to obtain the same data I previously had to compile by hand.

Noah Milstein
2022-03-05

# Attach rvest explicitly: read_html(), html_nodes(), html_table() and the
# `%>%` pipe used throughout this post are all exported by it, and nothing
# else in the visible script loads them.
library(rvest)

# Wikipedia's list of wars for the years 1000-1499.
list_of_wars_1000 <- "https://en.wikipedia.org/wiki/List_of_wars:_1000%E2%80%931499"

# Download and parse the page once; the later chunks reuse this object.
# NOTE: despite the `_df` suffix this holds the parsed HTML document, not a
# data frame.
wars_1000s_df <- read_html(list_of_wars_1000)

# Select every <table> node, take the second one (wars starting in the
# 1000s) and convert it to a tibble.
wars_1000s <- wars_1000s_df %>%
  html_nodes("table") %>%
  `[[`(2) %>%
  html_table()

# Keep the first five columns: start, finish, conflict name and the two
# belligerent columns.
# NOTE(review): the printed result shows row 1 repeating the column labels,
# so that row still has to be dropped when the data is cleaned.
wars_1000s_subset <- wars_1000s[, 1:5]
wars_1000s_subset
# A tibble: 58 × 5
   Start  Finish `Name of conflict`          Belligerents Belligerents
   <chr>  <chr>  <chr>                       <chr>        <chr>       
 1 Start  Finish Name of conflict            Victorious … "Defeated p…
 2 1000   1139   Norman conquest of souther… County of A… "Principali…
 3 1001   1001   Battle of Peshawar (1001)   Ghaznavids   "Kabul Shah…
 4 1002   1018   German–Polish War           Kingdom of … "Holy Roman…
 5 1008   1008   Hungarian–Ahtum War         Kingdom of … "Voivodeshi…
 6 1008   1008   Battle of Chach             Ghaznavids   "Kabul Shah…
 7 1007–8 1007–8 Battle at Herdaler          Kingdom of … "Finnish tr…
 8 1009   1031   Fitna of al-Andalus         Hammudid dy… "Caliphate …
 9 1010   1011   Second conflict in the Gor… Liao dynasty "Goryeo"    
10 1014   1014   Battle of Clontarf          High King o… "LeinsterDu…
# … with 48 more rows
# Third <table> of the already-parsed 1000-1499 page: wars starting in the
# 1100s, converted to a tibble.
wars_1100s <- html_table(html_nodes(wars_1000s_df, "table")[[3]])

# Retain only the first five columns (start, finish, conflict name and the
# two belligerent columns), then print them.
# NOTE(review): row 1 of the printed result repeats the column labels.
wars_1100s_subset <- wars_1100s[, seq_len(5)]
wars_1100s_subset
# A tibble: 39 × 5
   Start          Finish    `Name of Confl…` Belligerents Belligerents
   <chr>          <chr>     <chr>            <chr>        <chr>       
 1 Start          Finish    Name of Conflict "Victorious… "Defeated p…
 2 Summer of 1101 Summer o… Crusade of 1101… "Sultanate … "Crusaders\…
 3 1101           1101      Battle of Ramla… "Kingdom of… "Fatimid Ca…
 4 1102           1102      Battle of Ramla… "Fatimids o… "Kingdom of…
 5 1107           1110      Norwegian Crusa… "Kingdom of… "Muslim Kin…
 6 1110           1110      Chola invasion … "Chola Empi… "Kalinga"   
 7 1113           1115      1113–15 Baleari… "Republic o… "Taifa of M…
 8 1122           1124      Venetian Crusad… "Republic o… "Fatimid Ca…
 9 1107           1119      Muhammad Tapar'… "Seljuq Emp… "Nizari Ism…
10 1123           1123      Kalmare ledungP… "Kingdom of… "Swedish Pa…
# … with 29 more rows
# `wars_1100s` and `wars_1100s_subset` are already built from table 3 of the
# 1000-1499 page in the preceding chunk, so the table is not extracted a
# second time here; only the print is repeated.
wars_1100s_subset
# A tibble: 39 × 5
   Start          Finish    `Name of Confl…` Belligerents Belligerents
   <chr>          <chr>     <chr>            <chr>        <chr>       
 1 Start          Finish    Name of Conflict "Victorious… "Defeated p…
 2 Summer of 1101 Summer o… Crusade of 1101… "Sultanate … "Crusaders\…
 3 1101           1101      Battle of Ramla… "Kingdom of… "Fatimid Ca…
 4 1102           1102      Battle of Ramla… "Fatimids o… "Kingdom of…
 5 1107           1110      Norwegian Crusa… "Kingdom of… "Muslim Kin…
 6 1110           1110      Chola invasion … "Chola Empi… "Kalinga"   
 7 1113           1115      1113–15 Baleari… "Republic o… "Taifa of M…
 8 1122           1124      Venetian Crusad… "Republic o… "Fatimid Ca…
 9 1107           1119      Muhammad Tapar'… "Seljuq Emp… "Nizari Ism…
10 1123           1123      Kalmare ledungP… "Kingdom of… "Swedish Pa…
# … with 29 more rows
# Fourth <table> of the already-parsed 1000-1499 page: wars starting in the
# 1200s, converted to a tibble.
wars_1200s <- html_table(html_nodes(wars_1000s_df, "table")[[4]])

# Retain only the first five columns (start, finish, conflict name and the
# two belligerent columns), then print them.
# NOTE(review): row 1 of the printed result repeats the column labels.
wars_1200s_subset <- wars_1200s[, seq_len(5)]
wars_1200s_subset
# A tibble: 83 × 5
   Start Finish `Name of Conflict`           Belligerents Belligerents
   <chr> <chr>  <chr>                        <chr>        <chr>       
 1 Start Finish Name of Conflict             "Victorious… "Defeated p…
 2 1201  1219   War of the Antiochene Succe… "Forces of … "Forces of …
 3 1202  1204   Fourth CrusadePart of the C… "Holy Roman… "Byzantine …
 4 1204  1206   Intervention in Chaldia      "Kingdom of… "Byzantine …
 5 1202  1204   Anglo-Norman War (1202–04)   "Kingdom of… "Kingdom of…
 6 1202  1214   Anglo-French War of 1202–12… "Kingdom of… "Kingdom of…
 7 1203  1206   Loon War                     "Holland Ki… "Loon Franc…
 8 1204  1261   Bulgarian–Latin wars         "Bulgarian … "Latin Empi…
 9 1206  1337   Mongol invasions and conque… "Mongol Emp… "西夏 Weste…
10 1208  1209   Lombard Rebellion            "Latin Empi… "Rebel baro…
# … with 73 more rows
# Fifth <table> of the already-parsed 1000-1499 page: wars starting in the
# 1300s, converted to a tibble.
wars_1300s <- html_table(html_nodes(wars_1000s_df, "table")[[5]])

# Retain only the first five columns (start, finish, conflict name and the
# two belligerent columns), then print them.
# NOTE(review): row 1 of the printed result repeats the column labels, and
# some Start/Finish values are free text (e.g. "c. 14th century").
wars_1300s_subset <- wars_1300s[, seq_len(5)]
wars_1300s_subset
# A tibble: 79 × 5
   Start           Finish   `Name of Confl…` Belligerents Belligerents
   <chr>           <chr>    <chr>            <chr>        <chr>       
 1 Start           Finish   Name of Conflict Victorious … Defeated pa…
 2 1300            1301     Second Mongol i… Myinsaing K… Yuan dynasty
 3 1300            1300     Lembu Sora rebe… Majapahit E… Lembu Sora …
 4 c. 14th century c. 14th… K'aissape–Hvals… Inuit under… Norsemen un…
 5 1303            1303     Conquest of Syl… Independent… Gaur Kingdo…
 6 1308            1308     Teutonic takeov… Teutonic Kn… Margraviate…
 7 1309            1309     Crusade of the … Duchy of Br… Crusaders   
 8 1311            1312     Rebellion of ma… Władysław I… Kraków      
 9 1311            1318     Delhi–Seuna War  Delhi Sulta… Seuna Empire
10 1314            1318     Esen Buqa–Ayurb… Yuan dynast… Chagatai Kh…
# … with 69 more rows
# Sixth <table> of the already-parsed 1000-1499 page: wars starting in the
# 1400s, converted to a tibble.
wars_1400s <- html_table(html_nodes(wars_1000s_df, "table")[[6]])

# Retain only the first five columns (start, finish, conflict name and the
# two belligerent columns), then print them.
# NOTE(review): row 1 of the printed result repeats the column labels.
wars_1400s_subset <- wars_1400s[, seq_len(5)]
wars_1400s_subset
# A tibble: 123 × 5
   Start Finish `Name of Conflict`           Belligerents Belligerents
   <chr> <chr>  <chr>                        <chr>        <chr>       
 1 Start Finish Name of Conflict             Victorious … "Defeated p…
 2 1400  1400   English invasion of Scotland Kingdom of … "Kingdom of…
 3 1400  1420   Glyndŵr Rising               Kingdom of … "Welsh rebe…
 4 1401  1404   First Samogitian Uprising    Teutonic St… "Grand Duch…
 5 1402  1402   Battle of Ankara             Timurid Emp… "Ottoman Em…
 6 1402  1413   Ottoman Interregnum          Faction of … "Faction of…
 7 1402  1496   Conquest of the Canary Isla… Union of Ca… "Guanches"  
 8 1403  1403   Percy Rebellion              Kingdom of … "English re…
 9 1404  1406   Paregreg war                 Western cou… "Eastern co…
10 1405  1405   Scrope Rebellion             Kingdom of … "English re…
# … with 113 more rows
# Wikipedia's list of wars for the years 1500-1799.
list_of_wars_1500 <- "https://en.wikipedia.org/wiki/List_of_wars:_1500%E2%80%931799"

# Download and parse this second page.
# NOTE: despite the `_df` suffix this holds the parsed HTML document, not a
# data frame.
wars_1500s_df <- read_html(list_of_wars_1500)

# On this page the first <table> is the one extracted (wars starting in the
# 1500s); convert it to a tibble.
wars_1500s <- html_table(html_nodes(wars_1500s_df, "table")[[1]])

# Retain only the first five columns (start, finish, conflict name and the
# two belligerent columns), then print them.
# NOTE(review): row 1 of the printed result repeats the column labels.
wars_1500s_subset <- wars_1500s[, seq_len(5)]

wars_1500s_subset
# A tibble: 161 × 5
   Start Finish `Name of conflict`           Belligerents Belligerents
   <chr> <chr>  <chr>                        <chr>        <chr>       
 1 Start Finish "Name of conflict"           Victorious … "Defeated p…
 2 1500  1503   "Second Muscovite–Lithuania… Grand Duchy… "Grand Duch…
 3 1500  1500   "Battle of Hemmingstedt"     Peasantry o… "Kalmar Uni…
 4 1501  1512   "Dano-Swedish War (1501–151… Sweden Free… "Kalmar Uni…
 5 1502  1510   "Persian–Uzbek wars"         Persian Emp… "Shaybanid …
 6 1502  1543   "Guelders Wars"              Holy Roman … "Duchy of G…
 7 1503  1505   "War of the Succession of L… Duchy of Ba… "Duchy of B…
 8 1505  1517   "Portuguese–Mamluk naval wa… Portugal     "Mamluk Sul…
 9 1507  1508   "Third Muscovite–Lithuanian… Grand Duchy… "Grand Duch…
10 1508  1516   "War of the League of Cambr… 1508–10: Pa… "1508–10: V…
# … with 151 more rows

Citation

For attribution, please cite this work as

Milstein (2022, March 5). Noah_Milstein_Blog: An Attempt to Scrape Wikipedia. Retrieved from https://nmilsteinuma.github.io/posts/2022-03-05-an-attempt-to-scrape-wikipedia/

BibTeX citation

@misc{milstein2022an,
  author = {Milstein, Noah},
  title = {Noah_Milstein_Blog: An Attempt to Scrape Wikipedia},
  url = {https://nmilsteinuma.github.io/posts/2022-03-05-an-attempt-to-scrape-wikipedia/},
  year = {2022}
}