4 Working with big data in R
4.1 Read in CSV files
4.1.1 Read one large CSV file
4.1.1.1 Read a CSV file with header
##It will take around 3 mins to read in 5,732,838 records with 105 variables
tran2<-fread("tranall2011_19.csv")4.1.1.2 Read a CSV file without header
tran2<-fread("tranall2011_19.csv",header=F)4.1.1.3 Read a CSV file with the second row as header and dropping the first row
epcdata1<-fread("D_EPC_data_2012_Q4_extract_0221.csv", skip = 1)
epcdata14<-read.csv("D_EPC_data_2020_Q4_extract_0221.csv", skip = 1)4.1.2 Fast reading multiple EPC csv files together in R
4.1.2.1 Code for reading in EPCs in England and Wales
## assume all the unzipped EPC stored in EPC folder in D drive
setwd("D:/EPC")
x1 <- list.files(path = ".", pattern = NULL, all.files = FALSE,
full.names = FALSE, recursive = FALSE)
datalist <- paste("D:/EPC",x1,"certificates.csv",sep="/")
epcdata = data.table::rbindlist(lapply(datalist, data.table::fread, showProgress = FALSE))4.1.2.2 Code for reading in EPCs in Scotland
datalist = list.files(pattern="*.csv")
epcdata = data.table::rbindlist(lapply(datalist, data.table::fread, skip=1,showProgress = FALSE))4.2 Basic larger dataset munging/wrangling
4.2.1 Select columns
class(tran2)## [1] "data.table" "data.frame"
needlist<-c("transactionid","postcode","price","dateoftransfer","propertytype","laua","lad11nm","tfarea","priceper","TRANSACTION_TYPE")
tran2<-tran2[,..needlist]
head(tran2)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {5F2B8B60-B9D0-4F00-8561-8BBF0C991BE1} KT22 7LN 187250 2014-07-11 F E07000210 Mole Valley 46 4070.652
## 2: {5F54B81D-B8BA-2B45-E053-6B04A8C01FB0} RH4 3QR 394000 2017-10-23 T E07000210 Mole Valley 77 5116.883
## 3: {F5CB911D-B4EF-4F74-A696-1131D46A1019} KT21 2LJ 447000 2013-10-24 S E07000210 Mole Valley 139 3215.827
## 4: {47844C80-5285-8986-E050-A8C063056488} KT22 7TQ 700700 2017-01-23 D E07000210 Mole Valley 148 4734.459
## 5: {666758D7-4CCD-3363-E053-6B04A8C0D74E} KT21 2DJ 350000 2018-01-03 F E07000210 Mole Valley 63 5555.556
## 6: {288DCE29-B9CB-E510-E050-A8C06205480E} KT22 7DX 135000 2015-10-20 F E07000210 Mole Valley 44 3068.182
## TRANSACTION_TYPE
## 1: rental (private)
## 2: marketed sale
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.2 Changing column names to lower case or upper case
4.2.2.1 Changing column names to lower case
setnames(tran2, tolower(names(tran2)))
head(tran2)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {5F2B8B60-B9D0-4F00-8561-8BBF0C991BE1} KT22 7LN 187250 2014-07-11 F E07000210 Mole Valley 46 4070.652
## 2: {5F54B81D-B8BA-2B45-E053-6B04A8C01FB0} RH4 3QR 394000 2017-10-23 T E07000210 Mole Valley 77 5116.883
## 3: {F5CB911D-B4EF-4F74-A696-1131D46A1019} KT21 2LJ 447000 2013-10-24 S E07000210 Mole Valley 139 3215.827
## 4: {47844C80-5285-8986-E050-A8C063056488} KT22 7TQ 700700 2017-01-23 D E07000210 Mole Valley 148 4734.459
## 5: {666758D7-4CCD-3363-E053-6B04A8C0D74E} KT21 2DJ 350000 2018-01-03 F E07000210 Mole Valley 63 5555.556
## 6: {288DCE29-B9CB-E510-E050-A8C06205480E} KT22 7DX 135000 2015-10-20 F E07000210 Mole Valley 44 3068.182
## transaction_type
## 1: rental (private)
## 2: marketed sale
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.2.2 Changing column names to upper case
setnames(tran2, toupper(names(tran2)))4.2.3 Filter rows based on conditions
tran2[laua=="E09000007", ]## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## ---
## 13775: {13521672-6024-48B3-BACE-B2CCD6280078} NW8 7ED 5610000 2015-07-17 F E09000007 Camden 240.00 23375.000
## 13776: {ED4810A9-BC16-49FF-9C6B-844F5E569704} NW8 7ED 6700000 2015-07-01 F E09000007 Camden 287.00 23344.948
## 13777: {EEDA2760-FB8A-4D56-96B2-A0934B38D6BB} NW8 7ED 9494000 2015-07-01 F E09000007 Camden 313.00 30332.268
## 13778: {773788C2-F00D-2CE4-E053-6C04A8C05E57} NW8 7ED 4000000 2018-09-07 F E09000007 Camden 145.00 27586.207
## 13779: {25EA59F9-FEAC-4D50-E050-A8C0630562D0} NW8 7ED 3725000 2015-09-30 F E09000007 Camden 145.00 25689.655
## transaction_type
## 1: marketed sale
## 2: rental (private)
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## ---
## 13775: marketed sale
## 13776: marketed sale
## 13777: marketed sale
## 13778: marketed sale
## 13779: marketed sale
Camden<-tran2[laua=="E09000007", ]
head(Camden)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type
## 1: marketed sale
## 2: rental (private)
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.4 Add in the ID column
Camden[,tranid := .I]
head(Camden)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type tranid
## 1: marketed sale 1
## 2: rental (private) 2
## 3: marketed sale 3
## 4: marketed sale 4
## 5: marketed sale 5
## 6: marketed sale 6
#Camden[, tranid := .I+1000000]4.2.5 Convert datatable values to uppercase
Camden[, `:=`(tran_type = toupper(transaction_type))]
head(Camden)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type tranid tran_type
## 1: marketed sale 1 MARKETED SALE
## 2: rental (private) 2 RENTAL (PRIVATE)
## 3: marketed sale 3 MARKETED SALE
## 4: marketed sale 4 MARKETED SALE
## 5: marketed sale 5 MARKETED SALE
## 6: marketed sale 6 MARKETED SALE
4.2.6 Delete a column
Camden[,transaction_type:=NULL]
head(Camden)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## tranid tran_type
## 1: 1 MARKETED SALE
## 2: 2 RENTAL (PRIVATE)
## 3: 3 MARKETED SALE
## 4: 4 MARKETED SALE
## 5: 5 MARKETED SALE
## 6: 6 MARKETED SALE
4.2.7 Remove Duplicates
dim(Camden)## [1] 13779 11
unique(Camden)## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## ---
## 13775: {13521672-6024-48B3-BACE-B2CCD6280078} NW8 7ED 5610000 2015-07-17 F E09000007 Camden 240.00 23375.000
## 13776: {ED4810A9-BC16-49FF-9C6B-844F5E569704} NW8 7ED 6700000 2015-07-01 F E09000007 Camden 287.00 23344.948
## 13777: {EEDA2760-FB8A-4D56-96B2-A0934B38D6BB} NW8 7ED 9494000 2015-07-01 F E09000007 Camden 313.00 30332.268
## 13778: {773788C2-F00D-2CE4-E053-6C04A8C05E57} NW8 7ED 4000000 2018-09-07 F E09000007 Camden 145.00 27586.207
## 13779: {25EA59F9-FEAC-4D50-E050-A8C0630562D0} NW8 7ED 3725000 2015-09-30 F E09000007 Camden 145.00 25689.655
## tranid tran_type
## 1: 1 MARKETED SALE
## 2: 2 RENTAL (PRIVATE)
## 3: 3 MARKETED SALE
## 4: 4 MARKETED SALE
## 5: 5 MARKETED SALE
## ---
## 13775: 13775 MARKETED SALE
## 13776: 13776 MARKETED SALE
## 13777: 13777 MARKETED SALE
## 13778: 13778 MARKETED SALE
## 13779: 13779 MARKETED SALE
dim(Camden)## [1] 13779 11
4.2.8 Write files
fwrite(Camden,"Camden.csv")4.2.9 Bind datasets
# Bind by names
class(epcdata1)## [1] "data.table" "data.frame"
class(epcdata14)## [1] "data.frame"
# Convert data.frame to data.table
setDT(epcdata14)
# Select columns
needlist<- c("BUILDING_REFERENCE_NUMBER","OSG_REFERENCE_NUMBER","ADDRESS1","ADDRESS2","ADDRESS3","POSTCODE","INSPECTION_DATE","LODGEMENT_DATE","PROPERTY_TYPE","TYPE_OF_ASSESSMENT","TRANSACTION_TYPE","TOTAL_FLOOR_AREA","NUMBER_HABITABLE_ROOMS","CURRENT_ENERGY_EFFICIENCY","POTENTIAL_ENERGY_EFFICIENCY")
epcdata1<-epcdata1[,..needlist]
epcdata14<-epcdata14[,..needlist]
# Bind by names
l = list(epcdata1,epcdata14)
epc<- rbindlist(l, use.names=TRUE)
# Remove Duplicates
dim(epc)## [1] 61640 15
unique(epc)## BUILDING_REFERENCE_NUMBER OSG_REFERENCE_NUMBER ADDRESS1 ADDRESS2 ADDRESS3 POSTCODE
## 1: 1001856748 118173704 21 CERES PLACE MOTHERWELL ML1 3WE
## 2: 1234570127 119007574 FLAT 1-1 1 Castle Mansions GOUROCK PA19 1PE
## 3: 1234570183 126006147 21 NORTH MIDDLETON DRIVE LARGS KA30 9JW
## 4: 1001220426 136024183 43 SKAITHMUIR CRESCENT FALKIRK FK2 8BP
## 5: 1234570199 200000000000 17 Lambie Street Bathgate EH47 0HH
## ---
## 61636: 1001582626 9051120275 FLAT 1 ALBURY VIEW FONTHILL ROAD ABERDEEN AB11 6TE
## 61637: 1001029287 9051116958 FLAT 4 35 CHESTNUT ROW ABERDEEN AB25 3SE
## 61638: 1000902315 9051116437 FLAT 4 63 ROSEMOUNT PLACE ABERDEEN AB25 2XL
## 61639: 1002090505 130142732 CORRAN SHIELDAIG STRATHCARRON IV54 8XN
## 61640: 1000232247 906700000000 FLAT 3/2 980 ARGYLE STREET GLASGOW G3 8LU
## INSPECTION_DATE LODGEMENT_DATE PROPERTY_TYPE TYPE_OF_ASSESSMENT TRANSACTION_TYPE TOTAL_FLOOR_AREA
## 1: 28/09/2012 01/10/2012 Flat RdSAP, existing dwelling marketed sale 72
## 2: 28/09/2012 01/10/2012 Bungalow RdSAP, existing dwelling marketed sale 61
## 3: 28/09/2012 01/10/2012 House RdSAP, existing dwelling marketed sale 110
## 4: 19/09/2012 01/10/2012 House RdSAP, existing dwelling rental 98
## 5: 28/09/2012 01/10/2012 House RdSAP, existing dwelling rental 78
## ---
## 61636: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 58
## 61637: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 37
## 61638: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 43
## 61639: 30/12/2020 31/12/2020 House RdSAP, existing dwelling none of the above 156
## 61640: 24/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 62
## NUMBER_HABITABLE_ROOMS CURRENT_ENERGY_EFFICIENCY POTENTIAL_ENERGY_EFFICIENCY
## 1: 3 77 79
## 2: 3 64 90
## 3: 5 60 80
## 4: 5 70 84
## 5: 3 50 82
## ---
## 61636: 4 73 77
## 61637: 2 72 77
## 61638: 2 59 75
## 61639: 7 27 83
## 61640: 3 56 78
dim(epc)## [1] 61640 15
4.3 Work with PostGIS database in R
4.3.1 Write files to PostGIS
# Loads the PostgreSQL driver
drv <- dbDriver("PostgreSQL")
# Creates a connection to the casa postGIS databas
con <- dbConnect(drv, dbname = "casa",port=5432, user="postgres",password=******)
# Write Camden to the database
dbWriteTable(con, "Camden",value=Camden, append = TRUE, row.names = FALSE)
# Delete some objects from workspace
rm(Camden,tran2,epc,epcdata1,epcdata14,epcdata)4.3.2 Read files from PostGIS
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "casa",port=5432, user="postgres",password=******)
# load the data from PostGIS database
tran<- dbGetQuery(con,"select * from Camden") 4.4 Measure code performance
4.4.1 Measure running time of the code
class(epc)## [1] "data.table" "data.frame"
# Delete the epc object
rm(epc)
# Bind by names
start_time <- Sys.time()
epc<-rbindlist(list(epcdata1,epcdata14),use.names=TRUE)
end_time <- Sys.time()
end_time - start_time## Time difference of 0.007995844 secs
# Convert data.table to data.frame
setDF(epcdata1)
setDF(epcdata14)
# Combine two data frames by rows
start_time <- Sys.time()
epcdata<-rbind(epcdata1,epcdata14)
end_time <- Sys.time()
end_time - start_time## Time difference of 0.01399207 secs
# Convert data.frame to data.table
setDT(epcdata14)
setDT(epcdata1)
# Combine two data tables by position
start_time <- Sys.time()
epc<-rbindlist(list(epcdata1,epcdata14))
end_time <- Sys.time()
end_time - start_time## Time difference of 0.008995056 secs
4.4.2 profvis- an interactive profile visualizations
profvis({
uniqueresult <- function(x){
dt <- as.data.table(x)
esummary<-dt[,.(count=.N),by=epcdataid]
idd1 <- esummary[esummary$count==1,]
result1 <- x[x$epcdataid %in% idd1$epcdataid,]
return(result1)
}
function1<- function(x,y){
x<-x[is.na(x$saotext),]
x<-x[is.na(x$subbuildingname),]
x$bnstreet <- paste(x$buildingnumber,x$streetdescription,sep=", ")
x$bnstreet <- gsub(" ", "", x$bnstreet)
x$addressf <- paste(x$postcodelocator,x$bnstreet,sep=", ")
y$addressfinal <- trimws(y$add)
y$addressfinal <- gsub(" ", "", y$addressfinal)
y$addressf <- paste(y$postcode,y$addressfinal,sep=", ")
taba1 <- inner_join(x,y,by="addressf")
return(taba1)
}
link1<-function1(add,epc)
link1u<- uniqueresult(link1)
})
4.5 Execute R code in Alteryx
- Step 1: build a workflow in Alteryx
- Step 2: create R code in the Alteryx
- Step 3: Click the use AMP engine and run the workflow
- Step 4: Finish
Notes: If you have any questions about Alteryx, Steven Zhang at Billigence would like to help.