what I have learned from coursera course Getting and Cleaning Data.
Reading Excel Files
read.xlsx {xlsx}
library(xlsx)
mydata = read.xlsx("data.xlsx", sheetIndex = 1, header = TRUE)
Reading specific rows and columns
mydata = read.xlsx("data.xlsx", sheetIndex = 1, header = TRUE, colIndex = 2:3,
rowIndex = 1:4)
Reading XML
library(XML)
fileUrl <- "http://wwww.w3schools.com/xml/simple.xml"
doc <- xmlTreeParse(fileUrl, useInternal = TRUE)
rootNode <- xmlRoot(doc)
xmlName(rootNode)
[1] "breakfast_menu"
names(rootNode)
food food food food food
"food" "food" "food" "food" "food"
Directly access parts of the XML document
rootNode[[1]]
<food>
<name>Belgian Waffles</name>
<price>$5.95</price>
<description>Two of our famous Belgian Waffles with plenty of real maple syrup</description>
<calories>650</calories>
</food>
rootNode[[1]][[1]]
<name>Belgian Waffles</name>
Programatically extract parts of the file
xmlSApply(rootNode, xmlValue)
food
"Belgian Waffles$5.95Two of our famous Belgian Waffles with plenty of real maple syrup650"
food
"Strawberry Belgian Waffles$7.95Light Belgian waffles covered with strawberries and whipped cream900"
food
"Berry-Berry Belgian Waffles$8.95Light Belgian waffles covered with an assortment of fresh berries and whipped cream900"
food
"French Toast$4.50Thick slices made from our homemade sourdough bread600"
food
"Homestyle Breakfast$6.95Two eggs, bacon or sausage, toast, and our ever-popular hash browns950"
Get the names and prices on the menu
xpathSApply(rootNode, "//name", xmlValue)
[1] "Belgian Waffles" "Strawberry Belgian Waffles"
[3] "Berry-Berry Belgian Waffles" "French Toast"
[5] "Homestyle Breakfast"
xpathSApply(rootNode, "//price", xmlValue)
[1] "$5.95" "$7.95" "$8.95" "$4.50" "$6.95"
Extract content by attributes
Use htmlTreeParse when the content is known to be (potentially malformed) HTML.
fileUrl = "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc <- htmlTreeParse(fileUrl, useInternal = TRUE)
scores <- xpathSApply(doc, "//li[@class='score']", xmlValue)
scores
[1] "49-27" "14-6" "30-9" "23-20" "26-23" "19-17"
[7] "19-16" "24-18" "20-17 OT" "23-20 OT" "19-3" "22-20"
[13] "29-26" "18-16" "41-7" "34-17"
teams <- xpathSApply(doc, "//li[@class='team-name']", xmlValue)
teams
[1] "Denver" "Cleveland" "Houston" "Buffalo" "Miami"
[6] "Green Bay" "Pittsburgh" "Cleveland" "Cincinnati" "Chicago"
[11] "New York" "Pittsburgh" "Minnesota" "Detroit" "New England"
[16] "Cincinnati"
Reading Json
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/felixfan/repos")
names(jsonData)
[1] "id" "name" "full_name"
[4] "owner" "private" "html_url"
[7] "description" "fork" "url"
[10] "forks_url" "keys_url" "collaborators_url"
[13] "teams_url" "hooks_url" "issue_events_url"
[16] "events_url" "assignees_url" "branches_url"
[19] "tags_url" "blobs_url" "git_tags_url"
[22] "git_refs_url" "trees_url" "statuses_url"
[25] "languages_url" "stargazers_url" "contributors_url"
[28] "subscribers_url" "subscription_url" "commits_url"
[31] "git_commits_url" "comments_url" "issue_comment_url"
[34] "contents_url" "compare_url" "merges_url"
[37] "archive_url" "downloads_url" "issues_url"
[40] "pulls_url" "milestones_url" "notifications_url"
[43] "labels_url" "releases_url" "created_at"
[46] "updated_at" "pushed_at" "git_url"
[49] "ssh_url" "clone_url" "svn_url"
[52] "homepage" "size" "stargazers_count"
[55] "watchers_count" "language" "has_issues"
[58] "has_downloads" "has_wiki" "forks_count"
[61] "mirror_url" "open_issues_count" "forks"
[64] "open_issues" "watchers" "default_branch"
names(jsonData$owner)
[1] "login" "id" "avatar_url"
[4] "gravatar_id" "url" "html_url"
[7] "followers_url" "following_url" "gists_url"
[10] "starred_url" "subscriptions_url" "organizations_url"
[13] "repos_url" "events_url" "received_events_url"
[16] "type" "site_admin"
jsonData$owner$login
[1] "felixfan" "felixfan" "felixfan" "felixfan" "felixfan" "felixfan"
[7] "felixfan" "felixfan" "felixfan" "felixfan" "felixfan" "felixfan"
[13] "felixfan" "felixfan" "felixfan" "felixfan" "felixfan" "felixfan"
[19] "felixfan" "felixfan" "felixfan" "felixfan" "felixfan" "felixfan"
[25] "felixfan" "felixfan" "felixfan" "felixfan" "felixfan" "felixfan"
jsonData$name
[1] "biojava" "Calculator"
[3] "ChiSquareCalculator" "coursera-android"
[5] "CPPLearning" "datasciencecoursera"
[7] "datasharing" "devtools"
[9] "felixfan.github.io" "FinCal"
[11] "FunPlots" "GEO"
[13] "IntroRandLaTeXforIR" "IPGWAS"
[15] "JavaSwing" "jekyll"
[17] "labs" "libsvm"
[19] "markdown-here" "OG-Platform"
[21] "oneliners" "pages"
[23] "powerAnalysis" "ProgrammingAssignment2"
[25] "PubMedWordcloud" "R-Graphics"
[27] "random-forest" "RGenetics"
[29] "slidify" "styles"
# create and updated date of project of 'FinCal'
jsonData[jsonData$name == "FinCal", c("created_at", "updated_at")]
created_at updated_at
10 2013-09-24T06:25:02Z 2013-12-23T07:31:54Z
# create and updated date of all projects
jsonData[, c("name", "created_at", "updated_at")]
name created_at updated_at
1 biojava 2014-02-25T08:34:06Z 2014-02-25T08:34:07Z
2 Calculator 2014-01-20T06:14:33Z 2014-02-27T09:12:59Z
3 ChiSquareCalculator 2014-02-21T06:52:11Z 2014-02-21T07:32:04Z
4 coursera-android 2014-01-20T14:23:37Z 2014-01-20T14:23:38Z
5 CPPLearning 2013-10-23T06:11:50Z 2013-12-07T15:34:51Z
6 datasciencecoursera 2014-04-08T07:14:57Z 2014-04-08T07:18:50Z
7 datasharing 2014-04-08T07:21:12Z 2014-04-08T07:21:13Z
8 devtools 2013-12-11T05:48:57Z 2013-12-11T05:48:58Z
9 felixfan.github.io 2013-09-18T06:01:49Z 2014-04-15T05:59:25Z
10 FinCal 2013-09-24T06:25:02Z 2013-12-23T07:31:54Z
11 FunPlots 2013-10-11T05:39:17Z 2013-12-07T15:39:42Z
12 GEO 2013-11-12T05:52:33Z 2013-12-07T15:37:25Z
13 IntroRandLaTeXforIR 2013-11-14T02:34:24Z 2013-11-14T02:34:24Z
14 IPGWAS 2014-01-24T03:35:16Z 2014-01-24T03:49:35Z
15 JavaSwing 2014-02-24T07:55:14Z 2014-02-24T08:15:20Z
16 jekyll 2013-09-17T03:39:15Z 2013-09-17T03:39:16Z
17 labs 2014-04-14T08:57:57Z 2014-04-14T08:57:58Z
18 libsvm 2014-03-13T06:33:28Z 2014-03-13T06:33:28Z
19 markdown-here 2014-02-27T09:20:30Z 2014-02-27T09:20:31Z
20 OG-Platform 2013-12-04T03:19:15Z 2013-12-04T03:19:19Z
21 oneliners 2013-10-25T09:03:55Z 2013-10-25T09:03:55Z
22 pages 2013-11-20T15:39:57Z 2013-11-20T15:39:57Z
23 powerAnalysis 2013-10-02T06:23:16Z 2013-10-02T06:24:13Z
24 ProgrammingAssignment2 2014-04-10T06:34:02Z 2014-04-10T06:34:03Z
25 PubMedWordcloud 2013-09-17T02:51:48Z 2014-01-15T04:00:33Z
26 R-Graphics 2014-02-06T06:15:53Z 2014-02-06T06:15:54Z
27 random-forest 2014-03-11T09:48:53Z 2014-03-11T09:48:54Z
28 RGenetics 2014-01-24T07:06:54Z 2014-01-24T07:07:48Z
29 slidify 2013-12-10T03:32:19Z 2013-12-10T03:32:19Z
30 styles 2013-11-14T09:50:36Z 2013-11-14T09:50:37Z