Projects

  1. Web Crawling with R
    • Example of crawling review data and star ratings from Verizon. This example is tailored to the 2016 Verizon web page. It was used for writing a published paper.
      • ### Crawling the user-generated reviews from the official website of Verizon (US mobile phone carrier), 2016.06.06 #
        # Session setup: load the scraping packages, start a Selenium-driven
        # Chrome browser, create the accumulators used by the crawl loop, and
        # open the product page.
        library(httr)
        library(rvest)
        library(XML)
        library(RSelenium)

        # NOTE(review): checkForServer() / startServer() come from older
        # RSelenium releases; later releases appear to have retired them in
        # favour of rsDriver() -- confirm against the installed version.
        checkForServer()
        startServer()

        # Create the Chrome remote driver and open a browser window.
        chrome <- remoteDriver(browserName = "chrome")
        chrome$open()

        # Accumulator: one list element per scraped review.
        s7_review_lst <- list()
        # Next free slot in s7_review_lst.
        cnt <- 1

        # Destination web address.
        URL <- "http://www.verizonwireless.com/smartphones/samsung-galaxy-s7/"

        # Point the browser at the destination page.
        chrome$navigate(URL)

        # Crawl loop: for every page of reviews, parse the rendered HTML,
        # append one record per review to s7_review_lst (advancing cnt), then
        # click the "next page" button. The loop stops the first time anything
        # inside the page-processing step raises an error -- typically when no
        # "next" button can be found on the last page.

        # CSS selector shared by the review count and the per-review lookup.
        review_selector <- "li.bv-content-item.bv-content-top-review.bv-content-review"

        # Maps the on-page label of each secondary rating to its record field.
        secondary_fields <- c(
          "Ease of use" = "ease_of_use",
          "Design" = "design",
          "Performance" = "performance",
          "Features" = "features"
        )

        # A flag replaces `break`: a handler function passed to tryCatch() is
        # not lexically inside the loop, so `break` cannot be used there.
        keep_crawling <- TRUE
        while (keep_crawling) {
          keep_crawling <- tryCatch({

            # Give the browser time to load all the data on the page.
            Sys.sleep(30)

            # Read the HTML structure of the currently rendered page.
            h <- read_html(chrome$getPageSource()[[1]])

            # Number of reviews on this page.
            review_length <- length(html_nodes(h, review_selector))

            # seq_len() yields an empty sequence for zero reviews, whereas
            # 1:0 would iterate twice and store empty records.
            for (i in seq_len(review_length)) {
              # Temporary record; fields stay empty vectors when the page
              # provides no value for them.
              temp_lst <- list(
                rating = vector(),
                ease_of_use = vector(),
                design = vector(),
                performance = vector(),
                features = vector(),

                user_id = vector(),
                generated_date = vector(),
                title = vector(),
                review = vector(),

                recommendation = vector(),
                review_helpful_y = vector(),
                review_helpful_n = vector(),

                pros_lst = vector(),
                cons_lst = vector(),
                previous_device_lst = vector()
              )

              # Access the i-th review on the page.
              node_review <- html_nodes(
                h, paste0(review_selector, ":nth-child(", i, ")")
              )

              # ---- main stars (overall star rating) ----
              # First character of the rating element's title attribute.
              star_nodes <- html_nodes(
                node_review,
                "abbr.bv-rating-max.bv-rating-stars.bv-rating-stars-on"
              )
              temp_lst$rating <- as.numeric(
                substr(html_attr(star_nodes, "title"), 1, 1)
              )

              # ---- sub stars ----
              # Each label <dt> is paired by position with the <dd> after it.
              label_nodes <- html_nodes(
                node_review, "dt.bv-content-secondary-ratings-label"
              )
              value_nodes <- html_nodes(
                node_review, "dt.bv-content-secondary-ratings-label + dd"
              )
              for (k in seq_along(label_nodes)) {
                label_name <- html_text(label_nodes[k])
                if (label_name %in% names(secondary_fields)) {
                  field <- secondary_fields[[label_name]]
                  value_text <- gsub(" ", "", html_text(value_nodes[k]))
                  temp_lst[[field]] <- as.numeric(substr(value_text, 1, 1))
                }
              }

              # ---- user ID (spaces removed) ----
              temp_lst$user_id <- gsub(
                " ", "", html_text(html_nodes(node_review, "h3.bv-author"))
              )

              # ---- date ----
              temp_lst$generated_date <- html_text(
                html_nodes(node_review, "span.bv-content-datetime-stamp")
              )

              # ---- title ----
              # NOTE(review): the original comment said "change multiple
              # spaces to single space", but this call removes every space
              # from the title -- confirm which behaviour is intended.
              temp_lst$title <- gsub(
                " ", "", html_text(html_nodes(node_review, "h4.bv-content-title"))
              )

              # ---- review text (one element per <p> node) ----
              temp_lst$review <- html_text(html_nodes(node_review, "p"))

              # ---- helpful / not-helpful votes ----
              # The first and second off-screen button labels carry the counts.
              vote_text <- html_text(
                html_nodes(node_review, "button span.bv-off-screen")
              )
              temp_lst$review_helpful_y <- as.numeric(gsub(
                " people found this review helpful. Click to agree.",
                "", vote_text[1]
              ))
              temp_lst$review_helpful_n <- as.numeric(gsub(
                " people did not find this review helpful. Click to agree.",
                "", vote_text[2]
              ))

              # ---- other extras (recommendation, pros, cons, device) ----
              data_labels <- html_nodes(
                node_review, "dt.bv-content-data-label-container"
              )
              for (k in seq_along(data_labels)) {
                # Strip spaces, then drop the first character of the label.
                content_label_name <- substring(
                  gsub(" ", "", html_text(data_labels[k])), 2
                )

                if (content_label_name == "Yes,") {
                  temp_lst$recommendation <- TRUE
                } else if (content_label_name == "No,") {
                  temp_lst$recommendation <- FALSE
                } else if (content_label_name == "Pros:") {
                  temp_lst$pros_lst <- gsub(",", "", html_text(html_nodes(
                    node_review,
                    "dl.bv-content-data-pros > dd.bv-content-data-value"
                  )))
                } else if (content_label_name == "Cons:") {
                  temp_lst$cons_lst <- gsub(",", "", html_text(html_nodes(
                    node_review,
                    "dl.bv-content-data-cons > dd.bv-content-data-value"
                  )))
                } else if (content_label_name == "PreviousDevice") {
                  temp_lst$previous_device_lst <- gsub(",", "", html_text(
                    html_nodes(node_review, "dd.bv-content-data-value")
                  ))
                }
              }

              # Store the finished record and advance the slot counter.
              s7_review_lst[[cnt]] <- temp_lst
              cnt <- cnt + 1
            }

            # After finishing one page, find and click the "next page" button.
            nextBtn <- chrome$findElement(
              "css selector", "span.bv-content-btn-pages-next"
            )
            nextBtn$clickElement()

            # Page handled successfully: keep crawling.
            TRUE
          }, error = function(e) {
            # Any error ends the crawl; report the underlying cause so a real
            # failure is not mistaken for simply reaching the last page.
            print("no button")
            message("Crawl stopped: ", conditionMessage(e))
            FALSE
          })
        }
        # Spot-check the crawl result: print how many reviews were collected,
        # then every field of one sample review and the user ID of the last
        # review. Indices are derived from the list length so the check works
        # whatever number of reviews was collected.
        n_reviews <- length(s7_review_lst)
        print(n_reviews)

        if (n_reviews > 0) {
          # Third review when available, otherwise the last one collected.
          sample_review <- s7_review_lst[[min(3L, n_reviews)]]
          last_review <- s7_review_lst[[n_reviews]]

          print(sample_review$rating)
          print(sample_review$ease_of_use)
          print(sample_review$design)
          print(sample_review$performance)
          print(sample_review$features)
          # Reaching the final element confirms the whole list is populated.
          print(last_review$user_id)
          print(sample_review$generated_date)
          print(sample_review$title)
          print(sample_review$review)
          print(sample_review$recommendation)
          print(sample_review$review_helpful_y)
          print(sample_review$review_helpful_n)
          print(sample_review$pros_lst)
          print(sample_review$cons_lst)
          print(sample_review$previous_device_lst)
        }

        # Extract the review texts from every collected record into one flat
        # character vector. The whole list is traversed rather than a fixed
        # count, and unlist() avoids growing a vector inside a loop. A record
        # may contribute several elements (one per paragraph) or none.
        review_vector <- as.character(unlist(
          lapply(s7_review_lst, function(entry) entry$review),
          use.names = FALSE
        ))

        # Save the review texts, then reload them and preview the first few
        # entries to confirm the file round-trips.
        save(review_vector, file = "verizon_s7_review.Rdata")
        load("verizon_s7_review.Rdata")
        head(review_vector)

        # Save the complete list of review records.
        save(s7_review_lst, file = "verizon_s7_all_data.Rdata")