- Web Crawling with R
- Example of crawling review data and star ratings from Verizon. This example is tailored to the 2016 version of the Verizon web page. It was used in writing a published paper.
- ###crawling the user-generated reviews from the official website of Verizon (US mobile phone carrier) 2016.06.06#
library(httr)
library(rvest)
library(XML)
library(RSelenium)

# ---- Browser / Selenium setup ----
# Make sure a Selenium server binary is available, then start it.
# NOTE(review): checkForServer() / startServer() appear to be defunct in newer
# RSelenium releases (rsDriver() is the replacement) -- confirm against the
# installed RSelenium version before running.
checkForServer()
startServer()

# Create a remote driver bound to Chrome and open a browser window.
chrome <- remoteDriver(browserName = "chrome")
chrome$open()

# Accumulator: one list element per crawled review.
s7_review_lst <- list()
# Index of the next free slot in s7_review_lst.
cnt <- 1

# Destination page holding the Galaxy S7 reviews.
URL <- "http://www.verizonwireless.com/smartphones/samsung-galaxy-s7/"
chrome$navigate(URL)
# ---- Crawl every review page ----

# Extract all fields of a single review node into a named list.
#
# node_review: node set for one review item (result of html_nodes()).
# Returns a list with the fields rating, ease_of_use, design, performance,
# features, user_id, generated_date, title, review, recommendation,
# review_helpful_y, review_helpful_n, pros_lst, cons_lst and
# previous_device_lst. Fields that are absent from the review stay as empty
# vectors.
parse_review_node <- function(node_review) {
  # Temporary list used to gather and check the info before saving.
  temp_lst <- list(
    rating = vector(),
    ease_of_use = vector(),
    design = vector(),
    performance = vector(),
    features = vector(),
    user_id = vector(),
    generated_date = vector(),
    title = vector(),
    review = vector(),
    recommendation = vector(),
    review_helpful_y = vector(),
    review_helpful_n = vector(),
    pros_lst = vector(),
    cons_lst = vector(),
    previous_device_lst = vector()
  )

  # ---- Main stars ----
  # Star rating: first character of the rating element's title attribute.
  rating_nodes <- html_nodes(
    node_review,
    "abbr.bv-rating-max.bv-rating-stars.bv-rating-stars-on"
  )
  temp_lst$rating <- as.numeric(substr(html_attr(rating_nodes, "title"), 1, 1))

  # ---- Sub stars ----
  # Each secondary-rating label is followed by a <dd> holding its value; the
  # label text decides which field receives the value.
  secondary_fields <- c(
    "Ease of use" = "ease_of_use",
    "Design" = "design",
    "Performance" = "performance",
    "Features" = "features"
  )
  secondary_labels <- html_text(
    html_nodes(node_review, "dt.bv-content-secondary-ratings-label")
  )
  secondary_values <- html_text(
    html_nodes(node_review, "dt.bv-content-secondary-ratings-label + dd")
  )
  for (k in seq_along(secondary_labels)) {
    label_name <- secondary_labels[k]
    if (label_name %in% names(secondary_fields)) {
      # Strip spaces, then keep the first character as the numeric score.
      temp_lst[[secondary_fields[[label_name]]]] <-
        as.numeric(substr(gsub(" ", "", secondary_values[k]), 1, 1))
    }
  }

  # ---- User ID ----
  temp_lst$user_id <- gsub(" ", "", html_text(html_nodes(node_review, "h3.bv-author")))

  # ---- Date ----
  temp_lst$generated_date <- html_text(
    html_nodes(node_review, "span.bv-content-datetime-stamp")
  )

  # ---- Title ----
  # NOTE(review): the original comment said "change multiple spaces to single
  # space", but this pattern removes every space -- confirm the intended
  # pattern.
  temp_lst$title <- gsub(" ", "", html_text(html_nodes(node_review, "h4.bv-content-title")))

  # ---- Review text ----
  temp_lst$review <- html_text(html_nodes(node_review, "p"))

  # ---- Helpfulness votes ----
  # The first off-screen button label carries the "helpful" count, the second
  # the "not helpful" count; strip the fixed sentence to leave the number.
  vote_text <- html_text(html_nodes(node_review, "button span.bv-off-screen"))
  temp_lst$review_helpful_y <- as.numeric(gsub(
    " people found this review helpful. Click to agree.", "", vote_text[1]
  ))
  temp_lst$review_helpful_n <- as.numeric(gsub(
    " people did not find this review helpful. Click to agree.", "", vote_text[2]
  ))

  # ---- Other extras (recommendation, pros, cons, previous device) ----
  content_labels <- html_text(
    html_nodes(node_review, "dt.bv-content-data-label-container")
  )
  for (k in seq_along(content_labels)) {
    # Remove spaces and drop the first character of the label text.
    content_label_name <- substring(gsub(" ", "", content_labels[k]), 2)
    if (content_label_name == "Yes,") {
      temp_lst$recommendation <- TRUE
    } else if (content_label_name == "No,") {
      temp_lst$recommendation <- FALSE
    } else if (content_label_name == "Pros:") {
      temp_lst$pros_lst <- gsub(",", "", html_text(
        html_nodes(node_review, "dl.bv-content-data-pros > dd.bv-content-data-value")
      ))
    } else if (content_label_name == "Cons:") {
      temp_lst$cons_lst <- gsub(",", "", html_text(
        html_nodes(node_review, "dl.bv-content-data-cons > dd.bv-content-data-value")
      ))
    } else if (content_label_name == "PreviousDevice") {
      # NOTE(review): this selector is not scoped to a previous-device
      # container, so it presumably also matches pros/cons values -- confirm.
      temp_lst$previous_device_lst <- gsub(",", "", html_text(
        html_nodes(node_review, "dd.bv-content-data-value")
      ))
    }
  }

  temp_lst
}

# CSS selector matching one review item on the current page.
review_selector <- "li.bv-content-item.bv-content-top-review.bv-content-review"

repeat {
  # tryCatch() yields TRUE when the page was parsed and the "next" button was
  # clicked, FALSE when anything failed. `break` cannot be issued from inside
  # the error handler (it is a function with no enclosing loop), so the loop
  # is left based on this flag instead.
  has_next_page <- tryCatch({
    # Give the browser time to load every piece of data on the page.
    Sys.sleep(30)
    # Read the HTML structure of the currently displayed page.
    h <- read_html(chrome$getPageSource()[[1]])
    # Number of reviews on this page.
    review_length <- length(html_nodes(h, review_selector))

    # seq_len() skips the loop entirely when the page holds no reviews.
    for (i in seq_len(review_length)) {
      # Access the i-th review.
      node_review <- html_nodes(
        h, paste0(review_selector, ":nth-child(", i, ")")
      )
      s7_review_lst[[cnt]] <- parse_review_node(node_review)
      cnt <- cnt + 1
    }

    # After finishing one page, find the "next page" button and click it.
    nextBtn <- chrome$findElement("css selector", "span.bv-content-btn-pages-next")
    nextBtn$clickElement()
    TRUE
  }, error = function(e) {
    # Any failure (typically: no next button on the last page) ends the crawl.
    print("no button")
    message("Stopping crawl: ", conditionMessage(e))
    FALSE
  })

  if (!has_next_page) {
    break
  }
}
# ---- Inspect the crawled data ----
# Total number of reviews collected.
length(s7_review_lst)

# Spot-check every field of one review. The original used an empty subscript
# (`[[]]`, which is an error) for `rating` and a hard-coded index 8005 for
# `user_id` (out of bounds for shorter crawls); a single index is used here.
inspect_idx <- 3
s7_review_lst[[inspect_idx]]$rating
s7_review_lst[[inspect_idx]]$ease_of_use
s7_review_lst[[inspect_idx]]$design
s7_review_lst[[inspect_idx]]$performance
s7_review_lst[[inspect_idx]]$features
s7_review_lst[[inspect_idx]]$user_id
s7_review_lst[[inspect_idx]]$generated_date
s7_review_lst[[inspect_idx]]$title
s7_review_lst[[inspect_idx]]$review
s7_review_lst[[inspect_idx]]$recommendation
s7_review_lst[[inspect_idx]]$review_helpful_y
s7_review_lst[[inspect_idx]]$review_helpful_n
s7_review_lst[[inspect_idx]]$pros_lst
s7_review_lst[[inspect_idx]]$cons_lst
s7_review_lst[[inspect_idx]]$previous_device_lst

# ---- Extract reviews ----
# Collect the review text of every crawled review into one character vector.
# The original looped over a hard-coded 1:4105, which fails when fewer reviews
# were crawled and ignores any beyond that count; the whole list is used here.
# NOTE(review): if the 4105 cut-off was deliberate, subset s7_review_lst first.
review_vector <- unlist(lapply(s7_review_lst, function(r) r$review))
if (is.null(review_vector)) {
  # unlist() returns NULL for an empty list; keep an empty vector instead.
  review_vector <- character(0)
}

# ---- Saving ----
# Persist the review texts, reload them and peek at the first entries.
save(review_vector, file = "verizon_s7_review.Rdata")
load("verizon_s7_review.Rdata")
head(review_vector)

# Persist the full per-review data as well.
save(s7_review_lst, file = "verizon_s7_all_data.Rdata")
- ###crawling the user-generated reviews from the official website of Verizon (US mobile phone carrier) 2016.06.06#
- Example of crawling review data and star ratings from Verizon. This example is tailored to the 2016 version of the Verizon web page. It was used in writing a published paper.