4 Working with big data in R
4.1 Read in CSV files
4.1.1 Read one large CSV file
4.1.1.1 Read a CSV file with header
##It will take around 3 mins to read in 5,732,838 records with 105 variables
<-fread("tranall2011_19.csv") tran2
4.1.1.2 Read a CSV file without header
<-fread("tranall2011_19.csv",header=F) tran2
4.1.1.3 Read a CSV file with the second row as header and dropping the first row
<-fread("D_EPC_data_2012_Q4_extract_0221.csv", skip = 1)
epcdata1<-read.csv("D_EPC_data_2020_Q4_extract_0221.csv", skip = 1) epcdata14
4.1.2 Fast reading multiple EPC csv files together in R
4.1.2.1 Code for reading in EPCs in England and Wales
## assume all the unzipped EPC stored in EPC folder in D drive
setwd("D:/EPC")
<- list.files(path = ".", pattern = NULL, all.files = FALSE,
x1 full.names = FALSE, recursive = FALSE)
<- paste("D:/EPC",x1,"certificates.csv",sep="/")
datalist
= data.table::rbindlist(lapply(datalist, data.table::fread, showProgress = FALSE)) epcdata
4.1.2.2 Code for reading in EPCs in Scotland
= list.files(pattern="*.csv")
datalist
= data.table::rbindlist(lapply(datalist, data.table::fread, skip=1,showProgress = FALSE)) epcdata
4.2 Basic larger dataset munging/wrangling
4.2.1 Select columns
class(tran2)
## [1] "data.table" "data.frame"
<-c("transactionid","postcode","price","dateoftransfer","propertytype","laua","lad11nm","tfarea","priceper","TRANSACTION_TYPE")
needlist
<-tran2[,..needlist]
tran2
head(tran2)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {5F2B8B60-B9D0-4F00-8561-8BBF0C991BE1} KT22 7LN 187250 2014-07-11 F E07000210 Mole Valley 46 4070.652
## 2: {5F54B81D-B8BA-2B45-E053-6B04A8C01FB0} RH4 3QR 394000 2017-10-23 T E07000210 Mole Valley 77 5116.883
## 3: {F5CB911D-B4EF-4F74-A696-1131D46A1019} KT21 2LJ 447000 2013-10-24 S E07000210 Mole Valley 139 3215.827
## 4: {47844C80-5285-8986-E050-A8C063056488} KT22 7TQ 700700 2017-01-23 D E07000210 Mole Valley 148 4734.459
## 5: {666758D7-4CCD-3363-E053-6B04A8C0D74E} KT21 2DJ 350000 2018-01-03 F E07000210 Mole Valley 63 5555.556
## 6: {288DCE29-B9CB-E510-E050-A8C06205480E} KT22 7DX 135000 2015-10-20 F E07000210 Mole Valley 44 3068.182
## TRANSACTION_TYPE
## 1: rental (private)
## 2: marketed sale
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.2 Changing column names to lower case or upper case
4.2.2.1 Changing column names to lower case
setnames(tran2, tolower(names(tran2)))
head(tran2)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {5F2B8B60-B9D0-4F00-8561-8BBF0C991BE1} KT22 7LN 187250 2014-07-11 F E07000210 Mole Valley 46 4070.652
## 2: {5F54B81D-B8BA-2B45-E053-6B04A8C01FB0} RH4 3QR 394000 2017-10-23 T E07000210 Mole Valley 77 5116.883
## 3: {F5CB911D-B4EF-4F74-A696-1131D46A1019} KT21 2LJ 447000 2013-10-24 S E07000210 Mole Valley 139 3215.827
## 4: {47844C80-5285-8986-E050-A8C063056488} KT22 7TQ 700700 2017-01-23 D E07000210 Mole Valley 148 4734.459
## 5: {666758D7-4CCD-3363-E053-6B04A8C0D74E} KT21 2DJ 350000 2018-01-03 F E07000210 Mole Valley 63 5555.556
## 6: {288DCE29-B9CB-E510-E050-A8C06205480E} KT22 7DX 135000 2015-10-20 F E07000210 Mole Valley 44 3068.182
## transaction_type
## 1: rental (private)
## 2: marketed sale
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.2.2 Changing column names to upper case
setnames(tran2, toupper(names(tran2)))
4.2.3 Filter rows based on conditions
=="E09000007", ] tran2[laua
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## ---
## 13775: {13521672-6024-48B3-BACE-B2CCD6280078} NW8 7ED 5610000 2015-07-17 F E09000007 Camden 240.00 23375.000
## 13776: {ED4810A9-BC16-49FF-9C6B-844F5E569704} NW8 7ED 6700000 2015-07-01 F E09000007 Camden 287.00 23344.948
## 13777: {EEDA2760-FB8A-4D56-96B2-A0934B38D6BB} NW8 7ED 9494000 2015-07-01 F E09000007 Camden 313.00 30332.268
## 13778: {773788C2-F00D-2CE4-E053-6C04A8C05E57} NW8 7ED 4000000 2018-09-07 F E09000007 Camden 145.00 27586.207
## 13779: {25EA59F9-FEAC-4D50-E050-A8C0630562D0} NW8 7ED 3725000 2015-09-30 F E09000007 Camden 145.00 25689.655
## transaction_type
## 1: marketed sale
## 2: rental (private)
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## ---
## 13775: marketed sale
## 13776: marketed sale
## 13777: marketed sale
## 13778: marketed sale
## 13779: marketed sale
<-tran2[laua=="E09000007", ]
Camdenhead(Camden)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type
## 1: marketed sale
## 2: rental (private)
## 3: marketed sale
## 4: marketed sale
## 5: marketed sale
## 6: marketed sale
4.2.4 Add in the ID column
:= .I]
Camden[,tranid head(Camden)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type tranid
## 1: marketed sale 1
## 2: rental (private) 2
## 3: marketed sale 3
## 4: marketed sale 4
## 5: marketed sale 5
## 6: marketed sale 6
#Camden[, tranid := .I+1000000]
4.2.5 Convert datatable values to uppercase
`:=`(tran_type = toupper(transaction_type))]
Camden[, head(Camden)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## transaction_type tranid tran_type
## 1: marketed sale 1 MARKETED SALE
## 2: rental (private) 2 RENTAL (PRIVATE)
## 3: marketed sale 3 MARKETED SALE
## 4: marketed sale 4 MARKETED SALE
## 5: marketed sale 5 MARKETED SALE
## 6: marketed sale 6 MARKETED SALE
4.2.6 Delete a column
:=NULL]
Camden[,transaction_type
head(Camden)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## 6: {AFE46158-BA70-4FAF-999D-70D94F0322F9} WC1H 0LP 255000 2012-11-23 F E09000007 Camden 26.00 9807.692
## tranid tran_type
## 1: 1 MARKETED SALE
## 2: 2 RENTAL (PRIVATE)
## 3: 3 MARKETED SALE
## 4: 4 MARKETED SALE
## 5: 5 MARKETED SALE
## 6: 6 MARKETED SALE
4.2.7 Remove Duplicates
dim(Camden)
## [1] 13779 11
unique(Camden)
## transactionid postcode price dateoftransfer propertytype laua lad11nm tfarea priceper
## 1: {CF95606A-595A-48F5-92CB-708D24CAFC58} NW3 7NL 1956000 2014-10-07 T E09000007 Camden 175.00 11177.143
## 2: {4EA5C667-79EF-4FF4-839D-5344F6F46842} NW2 2BT 3599000 2012-11-05 D E09000007 Camden 402.00 8952.736
## 3: {7EE393AB-3710-4D5D-AE60-7260ADFDEE54} NW3 7NL 1310000 2011-05-17 T E09000007 Camden 193.00 6787.565
## 4: {7E86B6FB-16FC-458C-E053-6B04A8C0C84C} NW3 7NL 2100000 2018-11-16 T E09000007 Camden 193.00 10880.829
## 5: {E33626A9-DCD2-4D65-AF98-33D8BC99D1FD} NW3 5PT 685000 2011-04-20 F E09000007 Camden 80.46 8513.547
## ---
## 13775: {13521672-6024-48B3-BACE-B2CCD6280078} NW8 7ED 5610000 2015-07-17 F E09000007 Camden 240.00 23375.000
## 13776: {ED4810A9-BC16-49FF-9C6B-844F5E569704} NW8 7ED 6700000 2015-07-01 F E09000007 Camden 287.00 23344.948
## 13777: {EEDA2760-FB8A-4D56-96B2-A0934B38D6BB} NW8 7ED 9494000 2015-07-01 F E09000007 Camden 313.00 30332.268
## 13778: {773788C2-F00D-2CE4-E053-6C04A8C05E57} NW8 7ED 4000000 2018-09-07 F E09000007 Camden 145.00 27586.207
## 13779: {25EA59F9-FEAC-4D50-E050-A8C0630562D0} NW8 7ED 3725000 2015-09-30 F E09000007 Camden 145.00 25689.655
## tranid tran_type
## 1: 1 MARKETED SALE
## 2: 2 RENTAL (PRIVATE)
## 3: 3 MARKETED SALE
## 4: 4 MARKETED SALE
## 5: 5 MARKETED SALE
## ---
## 13775: 13775 MARKETED SALE
## 13776: 13776 MARKETED SALE
## 13777: 13777 MARKETED SALE
## 13778: 13778 MARKETED SALE
## 13779: 13779 MARKETED SALE
dim(Camden)
## [1] 13779 11
4.2.8 Write files
fwrite(Camden,"Camden.csv")
4.2.9 Bind datasets
# Bind by names
class(epcdata1)
## [1] "data.table" "data.frame"
class(epcdata14)
## [1] "data.frame"
# Convert data.frame to data.table
setDT(epcdata14)
# Select columns
<- c("BUILDING_REFERENCE_NUMBER","OSG_REFERENCE_NUMBER","ADDRESS1","ADDRESS2","ADDRESS3","POSTCODE","INSPECTION_DATE","LODGEMENT_DATE","PROPERTY_TYPE","TYPE_OF_ASSESSMENT","TRANSACTION_TYPE","TOTAL_FLOOR_AREA","NUMBER_HABITABLE_ROOMS","CURRENT_ENERGY_EFFICIENCY","POTENTIAL_ENERGY_EFFICIENCY")
needlist
<-epcdata1[,..needlist]
epcdata1<-epcdata14[,..needlist]
epcdata14
# Bind by names
= list(epcdata1,epcdata14)
l <- rbindlist(l, use.names=TRUE)
epc
# Remove Duplicates
dim(epc)
## [1] 61640 15
unique(epc)
## BUILDING_REFERENCE_NUMBER OSG_REFERENCE_NUMBER ADDRESS1 ADDRESS2 ADDRESS3 POSTCODE
## 1: 1001856748 118173704 21 CERES PLACE MOTHERWELL ML1 3WE
## 2: 1234570127 119007574 FLAT 1-1 1 Castle Mansions GOUROCK PA19 1PE
## 3: 1234570183 126006147 21 NORTH MIDDLETON DRIVE LARGS KA30 9JW
## 4: 1001220426 136024183 43 SKAITHMUIR CRESCENT FALKIRK FK2 8BP
## 5: 1234570199 200000000000 17 Lambie Street Bathgate EH47 0HH
## ---
## 61636: 1001582626 9051120275 FLAT 1 ALBURY VIEW FONTHILL ROAD ABERDEEN AB11 6TE
## 61637: 1001029287 9051116958 FLAT 4 35 CHESTNUT ROW ABERDEEN AB25 3SE
## 61638: 1000902315 9051116437 FLAT 4 63 ROSEMOUNT PLACE ABERDEEN AB25 2XL
## 61639: 1002090505 130142732 CORRAN SHIELDAIG STRATHCARRON IV54 8XN
## 61640: 1000232247 906700000000 FLAT 3/2 980 ARGYLE STREET GLASGOW G3 8LU
## INSPECTION_DATE LODGEMENT_DATE PROPERTY_TYPE TYPE_OF_ASSESSMENT TRANSACTION_TYPE TOTAL_FLOOR_AREA
## 1: 28/09/2012 01/10/2012 Flat RdSAP, existing dwelling marketed sale 72
## 2: 28/09/2012 01/10/2012 Bungalow RdSAP, existing dwelling marketed sale 61
## 3: 28/09/2012 01/10/2012 House RdSAP, existing dwelling marketed sale 110
## 4: 19/09/2012 01/10/2012 House RdSAP, existing dwelling rental 98
## 5: 28/09/2012 01/10/2012 House RdSAP, existing dwelling rental 78
## ---
## 61636: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 58
## 61637: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 37
## 61638: 30/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 43
## 61639: 30/12/2020 31/12/2020 House RdSAP, existing dwelling none of the above 156
## 61640: 24/12/2020 31/12/2020 Flat RdSAP, existing dwelling rental 62
## NUMBER_HABITABLE_ROOMS CURRENT_ENERGY_EFFICIENCY POTENTIAL_ENERGY_EFFICIENCY
## 1: 3 77 79
## 2: 3 64 90
## 3: 5 60 80
## 4: 5 70 84
## 5: 3 50 82
## ---
## 61636: 4 73 77
## 61637: 2 72 77
## 61638: 2 59 75
## 61639: 7 27 83
## 61640: 3 56 78
dim(epc)
## [1] 61640 15
4.3 Work with PostGIS database in R
4.3.1 Write files to PostGIS
# Loads the PostgreSQL driver
<- dbDriver("PostgreSQL")
drv # Creates a connection to the casa postGIS databas
<- dbConnect(drv, dbname = "casa",port=5432, user="postgres",password=******)
con
# Write Camden to the database
dbWriteTable(con, "Camden",value=Camden, append = TRUE, row.names = FALSE)
# Delete some objects from workspace
rm(Camden,tran2,epc,epcdata1,epcdata14,epcdata)
4.3.2 Read files from PostGIS
<- dbDriver("PostgreSQL")
drv <- dbConnect(drv, dbname = "casa",port=5432, user="postgres",password=******)
con
# load the data from PostGIS database
<- dbGetQuery(con,"select * from Camden") tran
4.4 Measure code performance
4.4.1 Measure running time of the code
class(epc)
## [1] "data.table" "data.frame"
# Delete the epc object
rm(epc)
# Bind by names
<- Sys.time()
start_time <-rbindlist(list(epcdata1,epcdata14),use.names=TRUE)
epc<- Sys.time()
end_time
- start_time end_time
## Time difference of 0.007995844 secs
# Convert data.table to data.frame
setDF(epcdata1)
setDF(epcdata14)
# Combine two data frames by rows
<- Sys.time()
start_time <-rbind(epcdata1,epcdata14)
epcdata<- Sys.time()
end_time - start_time end_time
## Time difference of 0.01399207 secs
# Convert data.frame to data.table
setDT(epcdata14)
setDT(epcdata1)
# Combine two data tables by position
<- Sys.time()
start_time <-rbindlist(list(epcdata1,epcdata14))
epc<- Sys.time()
end_time
- start_time end_time
## Time difference of 0.008995056 secs
4.4.2 profvis- an interactive profile visualizations
profvis({
<- function(x){
uniqueresult <- as.data.table(x)
dt <-dt[,.(count=.N),by=epcdataid]
esummary<- esummary[esummary$count==1,]
idd1 <- x[x$epcdataid %in% idd1$epcdataid,]
result1 return(result1)
}<- function(x,y){
function1<-x[is.na(x$saotext),]
x<-x[is.na(x$subbuildingname),]
x$bnstreet <- paste(x$buildingnumber,x$streetdescription,sep=", ")
x$bnstreet <- gsub(" ", "", x$bnstreet)
x$addressf <- paste(x$postcodelocator,x$bnstreet,sep=", ")
x
$addressfinal <- trimws(y$add)
y$addressfinal <- gsub(" ", "", y$addressfinal)
y$addressf <- paste(y$postcode,y$addressfinal,sep=", ")
y
<- inner_join(x,y,by="addressf")
taba1 return(taba1)
}
<-function1(add,epc)
link1
<- uniqueresult(link1)
link1u
})
4.5 Execute R code in Alteryx
- Step 1: build a workflow in Alteryx
- Step 2: create R code in the Alteryx
- Step 3: Click the use AMP engine and run the workflow
- Step 4: Finish
Notes: If you have any questions about Alteryx, Steven Zhang at Billigence would like to help.