Commit f857cca

Merge pull request #20 from ibm-watson-data-lab/R_ibmos2spark_COS_Support
R Cloud Object Storage Support
2 parents (677b840 + c5eafb8), commit f857cca

File tree

9 files changed: +163 additions, -159 deletions

r/sparkr/DESCRIPTION

Lines changed: 4 additions & 4 deletions (the `Imports` lines change only in trailing whitespace)

@@ -1,13 +1,13 @@
 Package: ibmos2sparkR
 Title: Loads Object Store data into Softlayer and Bluemix
-Version: 0.0.7
+Version: 0.0.8
 Authors@R: person("Jim", "Crozier", email = "[email protected]",
     role = c("aut", "cre"))
-Description: Loads data from Object Store in Softlayer and Bluemix
+Description: Loads data from Object Store in Softlayer, Bluemix, and IBM Cloud Object Storage
 Depends:
     R (>= 3.1.0)
 License: Apache
 LazyData: true
 RoxygenNote: 5.0.1
-Imports:
-    SparkR
+Imports:
+    SparkR
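A quick way to confirm the version bump after reinstalling (a trivial sketch; assumes the package was installed from this commit):

    library(ibmos2sparkR)
    packageVersion("ibmos2sparkR")
    # [1] '0.0.8'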

r/sparkr/NAMESPACE

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
+export(CloudObjectStorage)
 export(bluemix)
 export(softlayer)
+exportClasses(CloudObjectStorage)
 exportClasses(bluemix)
 exportClasses(softlayer)
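Once the updated package is attached, the new export can be checked alongside the existing ones (a minimal sketch; only the three class names above are guaranteed by this NAMESPACE):

    library(ibmos2sparkR)
    # The exported API should now include all three configuration classes:
    # bluemix, CloudObjectStorage, and softlayer.
    sort(ls("package:ibmos2sparkR"))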

r/sparkr/R/osconfig.R

Lines changed: 78 additions & 23 deletions

@@ -9,24 +9,24 @@ swifturl = function(name, container_name, object_name){
 
 
 #' sparkcontext is a SparkContext object.
-#'
+#'
 #' name is a string that identifies this configuration. You can
 #' use any string you like. This allows you to create
 #' multiple configurations to different Object Storage accounts.
 #' auth_url, username and password are string credentials for your
 #' Softlayer Object Store
-#' @export softlayer
+#' @export softlayer
 #' @exportClass softlayer
 
 softlayer <- setRefClass("softlayer",
    fields=list(name="character", container_name="character", object_name="character",
-               sparkcontext='jobj', auth_url="character",
+               sparkcontext='jobj', auth_url="character",
                tenant = "character", username="character", password="character"),
-   methods=list(initialize =
+   methods=list(initialize =
                   function( sparkcontext, name, auth_url, tenant, username, password, public=FALSE,
-                            swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){
+                            swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){
 
-                    .self$name = name
+                    .self$name = name
                     prefix = paste("fs.swift2d.service" , name, sep =".")
                     hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
                     SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -41,21 +41,21 @@ softlayer <- setRefClass("softlayer",
                     SparkR:::callJMethod(hConf, "set", paste(prefix, "use.get.auth", sep='.'), "true")
                     invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "location-aware", sep='.'), FALSE))
                     SparkR:::callJMethod(hConf, "set", paste(prefix, "password", sep='.'), password)
-
-
+
+
                   },
-
+
                 url = function(container_name, object_name){
                   return(swifturl(name, container_name, object_name))}
    )
 )
 
 
-
+
 #' sparkcontext: a SparkContext object.
 #'
 #' credentials: a dictionary with the following required keys:
-#'
+#'
 #'   auth_url
 #'   project_id (or projectId)
 #'   user_id (or userId)
@@ -73,28 +73,28 @@ softlayer <- setRefClass("softlayer",
 #' instances, the values for these credentials can be obtained
 #' by clicking on the 'insert to code' link just below a data
 #' source.
-#' @export bluemix
+#' @export bluemix
 #' @exportClass bluemix
 
-
+
 bluemix <- setRefClass("bluemix",
-   fields=list(name="character", credentials = "list",
+   fields=list(name="character", credentials = "list",
                sparkcontext='jobj', public = "character"),
-   methods=list(initialize =
+   methods=list(initialize =
                  function(..., sparkcontext, name=NULL, credentials,
-                          public=FALSE,swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){
+                          public=FALSE,swift2d_driver='com.ibm.stocator.fs.ObjectStoreFileSystem'){
 
                    callSuper(...,credentials=credentials)
-
+
                    if ( is.null(name)) name <<- credentials["name"][[1]]
-
+
                    user_id = try( credentials['user_id'][[1]])
                    if(class(user_id)=="try-error") user_id = credentials['userId'][[1]]
-
+
                    tenant = try( credentials['project_id'][[1]])
                    if(class(tenant)=="try-error") tenant = credentials['projectId'][[1]]
-
-                   .self$name = name
+
+                   .self$name = name
                    prefix = paste("fs.swift2d.service" , name, sep =".")
                    hConf = SparkR:::callJMethod(sparkcontext, "hadoopConfiguration")
                    SparkR:::callJMethod(hConf, "set", "fs.swift2d.impl", swift2d_driver)
@@ -108,8 +108,63 @@ bluemix <- setRefClass("bluemix",
                    invisible(SparkR:::callJMethod(hConf, "setBoolean", paste(prefix, "public", sep='.'), public))
                    #invisible(SparkR:::callJMethod(hConf, "setInt", paste(prefix, "http.port", sep='.'), 8080))
                  },
-
+
                 url = function( container_name, object_name){
                   return(swifturl(name, container_name, object_name))}
    )
-)
+)
+
+#' CloudObjectStorage is a class designed for IBM Cloud Object Storage (COS).
+#' It sets up the Hadoop configuration for COS and provides the final file URL.
+#'
+#' sparkContext: a SparkContext object.
+#'
+#' credentials: a list with the following required keys:
+#'   endpoint
+#'   accessKey
+#'   secretKey
+#'
+#' configurationName: a string that identifies the configuration being set.
+#' When using this from an IBM Spark service instance that
+#' is configured to connect to particular Bluemix object store
+#' instances, the values for these credentials can be obtained
+#' by clicking on the 'insert to code' link just below a data
+#' source.
+#' @export CloudObjectStorage
+#' @exportClass CloudObjectStorage
+CloudObjectStorage <- setRefClass("CloudObjectStorage",
+   fields=list(configName="character"),
+   methods=list(
+     initialize = function(..., sparkContext, credentials, configurationName){
+
+       if (is.null(credentials["endpoint"][[1]])) {
+         stop("Attribute endpoint in credentials is missing!")
+       }
+
+       if (is.null(credentials["accessKey"][[1]])) {
+         stop("Attribute accessKey in credentials is missing!")
+       }
+
+       if (is.null(credentials["secretKey"][[1]])) {
+         stop("Attribute secretKey in credentials is missing!")
+       }
+
+       .self$configName = configurationName
+       prefix = "fs.s3d.service"
+       hConf = SparkR:::callJMethod(sparkContext, "hadoopConfiguration")
+       SparkR:::callJMethod(hConf, "set", paste(prefix, "endpoint", sep='.'), credentials['endpoint'][[1]])
+       SparkR:::callJMethod(hConf, "set", paste(prefix, "access.key", sep='.'), credentials['accessKey'][[1]])
+       SparkR:::callJMethod(hConf, "set", paste(prefix, "secret.key", sep='.'), credentials['secretKey'][[1]])
+     },
+
+     getConfigName = function() {
+       return(.self$configName)
+     },
+
+     url = function(bucketName, objectName){
+       return(paste("s3d://", bucketName, ".service/", objectName, sep = ""))
+     }
+   )
+)
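For context, a minimal usage sketch of the new class, adapted from the README changes in this same commit (the endpoint and keys are placeholders, and `sc` is assumed to be the SparkContext that the SparkR notebook environment provides):

    library(ibmos2sparkR)

    # Placeholder credentials -- in DSX, "insert to code" generates this list.
    credentials <- list(
      endpoint  = "https://s3-api.objectstorage.example.net/",  # hypothetical endpoint
      accessKey = "my-access-key",
      secretKey = "my-secret-key"
    )

    # The constructor validates the three required keys, then writes them into
    # the Hadoop configuration under the fs.s3d.service prefix.
    cos <- CloudObjectStorage(sparkContext = sc,
                              credentials = credentials,
                              configurationName = "myCosConfig")

    # url() is pure string assembly: s3d://<bucket>.service/<object>
    cos$url("my-bucket", "my_data.csv")
    # [1] "s3d://my-bucket.service/my_data.csv"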

r/sparkr/README.md

Lines changed: 47 additions & 23 deletions

@@ -1,23 +1,23 @@
 # ibmos2sparkR
 
-The package sets Spark Hadoop configurations for connecting to
+The package sets Spark Hadoop configurations for connecting to
 IBM Bluemix Object Storage and Softlayer Account Object Storage instances. This package uses the new [stocator](https://github.com/SparkTC/stocator) driver, which implements the `swift2d` protocol, and is available
-on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience).
+on the latest IBM Apache Spark Service instances (and through IBM Data Science Experience).
 
-Using the `stocator` driver connects your Spark executor nodes directly
+Using the `stocator` driver connects your Spark executor nodes directly
 to your data in object storage.
 This is an optimized, high-performance method to connect Spark to your data. All IBM Apache Spark kernels
-are instantiated with the `stocator` driver in the Spark kernel's classpath.
-You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator)
-and adding it to your local Apache Spark kernel's classpath.
+are instantiated with the `stocator` driver in the Spark kernel's classpath.
+You can also run this locally by installing the [stocator driver](https://github.com/SparkTC/stocator)
+and adding it to your local Apache Spark kernel's classpath.
 
 
 This package expects a SparkContext instantiated by SparkR. It has been tested to work with
 the IBM Spark service in R notebooks on IBM DSX, though it should work with other Spark installations
 that utilize the [swift2d/stocator](https://github.com/SparkTC/stocator) protocol.
 
 
-## Installation
+## Installation
 
     library(devtools)
     devtools::install_url("https://github.com/ibm-cds-labs/ibmos2spark/archive/<version>.zip", subdir= "r/sparkr/")
@@ -27,15 +27,39 @@ where `version` should be a tagged release, such as `0.0.7`. (If you're daring,
 ## Usage
 
 The usage of this package depends on *from where* your Object Storage instance was created. This package
-is intended to connect to IBM's Object Storage instances obtained from Bluemix or Data Science Experience
-(DSX) or from a separate account on IBM Softlayer. The instructions below show how to connect to
-either type of instance.
+is intended to connect to IBM's Object Storage instances obtained from Bluemix or Data Science Experience
+(DSX) or from a separate account on IBM Softlayer. It also supports IBM Cloud Object Storage (COS). The
+instructions below show how to connect to either type of instance.
 
 The connection setup is essentially the same. But the difference for you is how you deliver the
 credentials. If your Object Storage was created with Bluemix/DSX, with a few clicks on the side-tab
 within a DSX Jupyter notebook, you can obtain your account credentials in the form of a list.
 If your Object Storage was created with a Softlayer account, each part of the credentials will
-be found as text that you can copy and paste into the example code below.
+be found as text that you can copy and paste into the example code below.
+
+### Cloud Object Storage
+
+    library(ibmos2sparkR)
+    configurationName = "bluemixO123"
+
+    # In DSX notebooks, the "insert to code" will insert this credentials list for you
+    credentials <- list(
+      accessKey = "123",
+      secretKey = "123",
+      endpoint = "https://s3-api.objectstorage.....net/"
+    )
+
+    cos <- CloudObjectStorage(sparkContext=sc, credentials=credentials, configurationName=configurationName)
+    bucketName <- "bucketName"
+    fileName <- "test.csv"
+    url <- cos$url(bucketName, fileName)
+
+    invisible(sparkR.session(appName = "SparkSession R"))
+
+    df.data.1 <- read.df(url,
+                         source = "org.apache.spark.sql.execution.datasources.csv.CSVFileFormat",
+                         header = "true")
+    head(df.data.1)
 
 ### Bluemix / Data Science Experience
 
@@ -45,11 +69,11 @@ be found as text that you can copy and paste into the example code below.
     # In DSX notebooks, the "insert to code" will insert this credentials list for you
     creds = list(
            auth_url="https://identity.open.softlayer.com",
-           region="dallas",
-           project_id = "XXXXX",
-           user_id="XXXXX",
+           region="dallas",
+           project_id = "XXXXX",
+           user_id="XXXXX",
            password="XXXXX")
-
+
     bmconfig = bluemix(sparkcontext=sc, name=configurationname, credentials = creds)
 
     container = "my_container"
@@ -67,24 +91,24 @@
     library(ibmos2sparkR)
    configurationname = "softlayerOScon" # can be any name you like (allows for multiple configurations)
 
-    slconfig = softlayer(sparkcontext=sc,
-                 name=configurationname,
+    slconfig = softlayer(sparkcontext=sc,
+                 name=configurationname,
                  auth_url="https://identity.open.softlayer.com",
-                 tenant = "XXXXX",
-                 username="XXXXX",
+                 tenant = "XXXXX",
+                 username="XXXXX",
                  password="XXXXX"
           )
-
+
     container = "my_container"
     object = "my_data.csv"
 
     data <- read.df(sqlContext, slconfig$url(container, object), source = "com.databricks.spark.csv", header = "true")
-
+
     # OR, for Spark >= 2.0.0
 
     data = read.df(slconfig$url(container, objectname), source="com.databricks.spark.csv", header="true")
-
-## License
+
+## License
 
 Copyright 2016 IBM Cloud Data Services
r/sparkr/man/CloudObjectStorage-class.Rd

Lines changed: 26 additions & 0 deletions (generated roxygen2 file; diff not rendered)

r/sparkr/man/bluemix-class.Rd

Lines changed: 3 additions & 17 deletions (generated roxygen2 file; diff not rendered)
