Creating data pipelines with R
By Danny Mawani Holmgaard
#load libraries
library(tidyverse)
library(DBI)
library(odbc)
#set up connection object
con <- DBI::dbConnect(odbc::odbc(),
                      Driver = "SQL Server",
                      Server = "yourserver.database.windows.net",
                      Database = "databasename",
                      UID = "userid",
                      PWD = "password",
                      Port = 1234)
#extract data
dataset <- as_tibble(
  tbl(con, "dataset") %>%
    head(100) #Get the first 100 rows
)
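If the pipeline also needs to load results back into the database, the same connection can be reused. A minimal sketch, assuming a destination table name of your choosing (the transformation and the table name "dataset_clean" are placeholders):
#transform the extracted data and write it back
datasetClean <- dataset %>%
  distinct() #example transformation: drop duplicate rows
DBI::dbWriteTable(con, "dataset_clean", datasetClean, overwrite = TRUE)
#close the connection when this step of the pipeline is done
DBI::dbDisconnect(con)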
#load library
library(RCurl)
#set up the FTP connection details
syntax <- "ftp://"
ftpHost <- "ftp.yourserver.com"
ftpUser <- "username"
ftpPass <- "password"
folder <- "/folder/" #use just "/" if the files sit in the root
ftpURL <- paste(syntax, ftpHost, folder, sep = "")
#Download files function
download <- function(file){
  fileDownload <- file
  downloadFtpUrl <- paste(ftpURL, fileDownload, sep = "")
  downloadFtpUrlCredentials <- paste(ftpUser, ":", ftpPass, sep = "")
  #fetch the file over FTP and write it to disk in binary mode
  bin <- getBinaryURL(downloadFtpUrl, userpwd = downloadFtpUrlCredentials)
  con <- file(fileDownload, open = "wb")
  writeBin(bin, con)
  close(con)
}
#Download files
download("yourfile.csv")
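The file lands in the working directory, so it can be read straight into a tibble. A minimal sketch, assuming a plain comma-separated file:
#read the downloaded file into a tibble (readr ships with the tidyverse)
ftpData <- readr::read_csv("yourfile.csv")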
#load library
library(RPostgreSQL)
#create connection object
con <- dbConnect(PostgreSQL(),
                 user = "",
                 password = "",
                 host = "",
                 port = 1234,
                 dbname = "")
#extract data
dataset <- as_tibble(
  tbl(con, "dataset") %>%
    head(100) #Get the first 100 rows
)
It is difficult to download Facebook and Instagram data
Seek out a third-party vendor instead
GitHub is the closest thing you get to a “Dropbox” for your code.
It allows version control and makes sure that your code is always up to date
From GitHub, you can push your code into other systems and collaborate on the same projects
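As a small illustration, and assuming a public repository, an R script can be sourced directly from its raw GitHub URL (the path below is a placeholder):
#source a script straight from a public GitHub repository
source("https://raw.githubusercontent.com/yourname/yourrepo/main/yourscript.R")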
Cloud Storage lets you upload and pull files in a secure environment
It has great APIs and can sync directly with BigQuery
Thanks to Mark Edmondson, we can also source R code directly from there
For example, a Cloud Build step can sync your repository into the bucket on every push:
steps:
- name: gcr.io/cloud-builders/gsutil
  args: ["-m", "rsync", "-r", "-c", "-d", ".", "gs://yourfolderincloudstorage/ifneededthencreateasubfolder"]
#install.packages("googleCloudStorageR") if you have not installed it yet
#load the library
library(googleCloudStorageR)
#Authenticate
gcs_auth()
#source an R script straight from your bucket
googleCloudStorageR::gcs_source('yoursubfolderifyouhaveone/yourscript.R', bucket = 'yourcreatedbucket')
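The same package also handles plain file transfer, which covers the "upload and pull files" part. A minimal sketch with placeholder object and bucket names:
#upload a local file to the bucket
gcs_upload("yourfile.csv", bucket = "yourcreatedbucket")
#pull it back down again
gcs_get_object("yourfile.csv", bucket = "yourcreatedbucket",
               saveToDisk = "yourfile.csv", overwrite = TRUE)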
# You need to authenticate with your GCP account before being able to do this
library(googleComputeEngineR)
gce_vm("yourmachinename", project = "gar-creds-185213", zone = "europe-west1-b",
       predefined_type = "g1-small",
       template = "rstudio",
       username = "username",
       password = "password")
#set machine to be launched
library(googleComputeEngineR)
library(googleCloudStorageR)
library(googleAuthR)
gar_auth("/home/username/.httr-oauth")
#put these environment variables in your .Renviron so they are picked up when the packages load:
#GCE_AUTH_FILE="/home/username/auth.json"
#GCE_DEFAULT_PROJECT_ID="projectname"
#GCE_DEFAULT_ZONE="europe-west1-b"
#BQ_AUTH_FILE="/home/username/bq.oauth"
gcs_global_bucket("bucketname")
#start the machine and set up SSH keys
vm <- gce_vm("yourvirtualmachine")
vm <- gce_ssh_setup(vm,
                    username = "username",
                    key.pub = "/home/username/.ssh/id_rsa.pub",
                    key.private = "/home/username/.ssh/id_rsa")
#build the command that authenticates on the VM and sources your script from Cloud Storage
runme <- "Rscript -e \"googleAuthR::gar_gce_auth();googleCloudStorageR::gcs_source('cloudstoragefolder/script.r', bucket = 'bucket')\""
#run it inside the rstudio container on the VM
docker_cmd(vm,
           cmd = "exec",
           args = c("rstudio", runme),
           wait = TRUE,
           capture_text = FALSE)
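Once the script has run, it is worth stopping the machine so it is not billed while idle. A minimal sketch using the same googleComputeEngineR VM object:
#stop the virtual machine when the job has finished
gce_vm_stop(vm)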
#load the library
library(bigQueryR)
# First create the table if it does not exist yet
# bqr_create_table(projectId = "your project id",
#                  datasetId = "dataset", "your table", yourdataframe,
#                  timePartitioning = FALSE, expirationMs = 0L)
# Then upload the data
bqr_upload_data(projectId = "your project id",
                datasetId = "dataset", "your table", yourdataframe,
                overwrite = FALSE, #TRUE to overwrite your table
                wait = TRUE, autodetect = FALSE,
                maxBadRecords = 1000)
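To verify the load, the table can be queried straight back into R. A minimal sketch with the same placeholder project, dataset and table names, assuming standard SQL:
#query the uploaded table back into a data frame
result <- bqr_query(projectId = "your project id",
                    datasetId = "dataset",
                    query = "SELECT COUNT(*) AS n FROM `dataset.your table`",
                    useLegacySql = FALSE)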
This is not the most stable way to do things, but it is the easiest
There are many ways to make your data flow
Start small and build your way up from there
@dannymawani (Twitter / LinkedIn)