libraries

library("data.table")
library("sqldf")
library("vroom")
library("microbenchmark")
library("ggplot2")
library("dplyr")

looking into a file without loading it

If you have a large file it is nice to get a glimpse of it before loading it completely. Below is the path to our test file of meteorological measurements in South Tyrol.

# path to the test data: a ~10 MB CSV of meteorological measurements
path_met = "/mnt/SAT/Workspaces/RosM/Exchange/forPete/OpenDataClim.csv"

get file size

… in mb

# file.size() returns bytes, so divide by 1e6 to report megabytes
file.size(path_met)/1e+6
## [1] 10.03322

read only 1st line to see what data looks like

get one line with fread: comes as data.table

# peek at the first row only; keepLeadingZeros stops codes like "0609" being parsed as numbers
data.table::fread(file = path_met, nrows = 1, keepLeadingZeros = TRUE)
##    V1       Date Year Month   SCODE SCODE2 Sensor        Mean Max Min
## 1:  1 2016-01-01 2016     1 06090SF   0609      N 0.001041667 0.1   0
##         Stdev Sum
## 1: 0.01017058 0.3

get one line with vroom: comes as tibble

# same one-row peek with vroom; the result comes back as a tibble
vroom::vroom(file = path_met, n_max = 1, delim = ",")
## New names:
## * `` -> ...1
## Observations: 1
## Variables: 12
## chr  [3]: SCODE, SCODE2, Sensor
## dbl  [8]: , Year, Month, Mean, Max, Min, Stdev, Sum
## date [1]: Date
## 
## Call `spec()` for a copy-pastable column specification
## Specify the column types with `col_types` to quiet this message
## # A tibble: 1 x 12
##    ...1 Date        Year Month SCODE SCODE2 Sensor    Mean   Max   Min
##   <dbl> <date>     <dbl> <dbl> <chr> <chr>  <chr>    <dbl> <dbl> <dbl>
## 1     1 2016-01-01  2016     1 0609… 0609   N      0.00104   0.1     0
## # … with 2 more variables: Stdev <dbl>, Sum <dbl>

load a subset of the file

get first 10 rows and selection of columns with fread

# load only the first 10 rows and a named subset of columns
fread(file = path_met, 
      nrows = 10, 
      select = c("Date", "Year", "Month", "SCODE", "Sensor", "Sum"))
##           Date Year Month   SCODE Sensor Sum
##  1: 2016-01-01 2016     1 06090SF      N 0.3
##  2: 2016-01-01 2016     1 61690SF      N 0.0
##  3: 2016-01-02 2016     1 06090SF      N 0.7
##  4: 2016-01-02 2016     1 61690SF      N 0.0
##  5: 2016-01-03 2016     1 06090SF      N 0.9
##  6: 2016-01-03 2016     1 61690SF      N 2.0
##  7: 2016-01-04 2016     1 06090SF      N 0.0
##  8: 2016-01-04 2016     1 61690SF      N 1.3
##  9: 2016-01-05 2016     1 06090SF      N 0.4
## 10: 2016-01-05 2016     1 61690SF      N 0.0

get first 10 rows and selection of columns with vroom

# vroom equivalent: n_max limits rows, col_select limits columns
vroom(file = path_met, n_max = 10, delim = ",",  
      col_select = c("Date", "Year", "Month", "SCODE", "Sensor", "Sum"))
## Observations: 10
## Variables: 6
## chr  [2]: SCODE, Sensor
## dbl  [3]: Year, Month, Sum
## date [1]: Date
## 
## Call `spec()` for a copy-pastable column specification
## Specify the column types with `col_types` to quiet this message
## # A tibble: 10 x 6
##    Date        Year Month SCODE   Sensor   Sum
##    <date>     <dbl> <dbl> <chr>   <chr>  <dbl>
##  1 2016-01-01  2016     1 06090SF N        0.3
##  2 2016-01-01  2016     1 61690SF N        0  
##  3 2016-01-02  2016     1 06090SF N        0.7
##  4 2016-01-02  2016     1 61690SF N        0  
##  5 2016-01-03  2016     1 06090SF N        0.9
##  6 2016-01-03  2016     1 61690SF N        2  
##  7 2016-01-04  2016     1 06090SF N        0  
##  8 2016-01-04  2016     1 61690SF N        1.3
##  9 2016-01-05  2016     1 06090SF N        0.4
## 10 2016-01-05  2016     1 61690SF N        0

sql statements on a file

If you know what you need before reading this is great!
This is about 10000 rows…

# apply an SQL filter while importing; head() shows only the first rows of the result
# NOTE(review): character columns keep their quotes here — sqldf does not strip them
sqldf::read.csv.sql(file = path_met, 
                    sql = "select * from file where Year = '2016' ") %>% head()
##     X       Date Year Month     SCODE SCODE2 Sensor        Mean Max Min
## 1 "1" 2016-01-01 2016     1 "06090SF" "0609"    "N" 0.001041667 0.1   0
## 2 "2" 2016-01-01 2016     1 "61690SF" "6169"    "N" 0.000000000 0.0   0
## 3 "3" 2016-01-02 2016     1 "06090SF" "0609"    "N" 0.003139013 0.1   0
## 4 "4" 2016-01-02 2016     1 "61690SF" "6169"    "N" 0.000000000 0.0   0
## 5 "5" 2016-01-03 2016     1 "06090SF" "0609"    "N" 0.003169014 0.1   0
## 6 "6" 2016-01-03 2016     1 "61690SF" "6169"    "N" 0.014084507 0.2   0
##                Stdev Sum
## 1 0.0101705833928307 0.3
## 2                  0 0.0
## 3 0.0174761988186352 0.7
## 4                  0 0.0
## 5 0.0175483066063057 0.9
## 6 0.0422615115809029 2.0

read the file completely

with fread (and see total number of rows)

# read the entire file into a data.table (107,108 rows, see printed output)
fread(file = path_met, keepLeadingZeros = TRUE)
##             V1       Date Year Month   SCODE SCODE2 Sensor         Mean
##      1:      1 2016-01-01 2016     1 06090SF   0609      N  0.001041667
##      2:      2 2016-01-01 2016     1 61690SF   6169      N  0.000000000
##      3:      3 2016-01-02 2016     1 06090SF   0609      N  0.003139013
##      4:      4 2016-01-02 2016     1 61690SF   6169      N  0.000000000
##      5:      5 2016-01-03 2016     1 06090SF   0609      N  0.003169014
##     ---                                                                
## 107104: 107104 2019-01-01 2019     1 90000SF   9000     HS 20.900000000
## 107105: 107105 2019-01-01 2019     1 90000SF   9000     LT  1.500000000
## 107106: 107106 2019-01-01 2019     1 91180SF   9118     HS 30.230000000
## 107107: 107107 2019-01-01 2019     1 91180SF   9118     LT -1.017000000
## 107108: 107108 2019-01-01 2019     1 91210WS   9121     LT -7.074000000
##            Max    Min      Stdev    Sum
##      1:  0.100  0.000 0.01017058  0.300
##      2:  0.000  0.000 0.00000000  0.000
##      3:  0.100  0.000 0.01747620  0.700
##      4:  0.000  0.000 0.00000000  0.000
##      5:  0.100  0.000 0.01754831  0.900
##     ---                                
## 107104: 20.900 20.900         NA 20.900
## 107105:  1.500  1.500         NA  1.500
## 107106: 30.230 30.230         NA 30.230
## 107107: -1.017 -1.017         NA -1.017
## 107108: -7.074 -7.074         NA -7.074

with vroom

# read the entire file into a tibble with vroom
vroom(file = path_met, delim = ",")
## New names:
## * `` -> ...1
## Observations: 107,108
## Variables: 12
## chr  [3]: SCODE, SCODE2, Sensor
## dbl  [8]: , Year, Month, Mean, Max, Min, Stdev, Sum
## date [1]: Date
## 
## Call `spec()` for a copy-pastable column specification
## Specify the column types with `col_types` to quiet this message
## # A tibble: 107,108 x 12
##     ...1 Date        Year Month SCODE SCODE2 Sensor    Mean   Max   Min
##    <dbl> <date>     <dbl> <dbl> <chr> <chr>  <chr>    <dbl> <dbl> <dbl>
##  1     1 2016-01-01  2016     1 0609… 0609   N      0.00104   0.1     0
##  2     2 2016-01-01  2016     1 6169… 6169   N      0         0       0
##  3     3 2016-01-02  2016     1 0609… 0609   N      0.00314   0.1     0
##  4     4 2016-01-02  2016     1 6169… 6169   N      0         0       0
##  5     5 2016-01-03  2016     1 0609… 0609   N      0.00317   0.1     0
##  6     6 2016-01-03  2016     1 6169… 6169   N      0.0141    0.2     0
##  7     7 2016-01-04  2016     1 0609… 0609   N      0         0       0
##  8     8 2016-01-04  2016     1 6169… 6169   N      0.00903   0.1     0
##  9     9 2016-01-05  2016     1 0609… 0609   N      0.00139   0.1     0
## 10    10 2016-01-05  2016     1 6169… 6169   N      0         0       0
## # … with 107,098 more rows, and 2 more variables: Stdev <dbl>, Sum <dbl>

benchmarking

# time the different read strategies (full reads vs. partial reads vs. SQL filter),
# 10 repetitions each
bm = microbenchmark(read.csv.sql(file = path_met, sql = "select * from file where Year = '2016' "), 
                    fread(file = path_met, nrows = 10000, keepLeadingZeros = TRUE), 
                    fread(file = path_met, keepLeadingZeros = TRUE), 
                    vroom(file = path_met, n_max = 10000, delim = ","),
                    vroom(file = path_met, delim = ","), 
                    times=10L)

Check the timings

# plot the benchmark distributions (ggplot2 autoplot method for microbenchmark objects)
autoplot(bm)
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Introduction to data.table

This will only scratch the surface. But there are enough resources on the web to get in deeper.

look at the data.table class

# fread returns a data.table, which also inherits from data.frame
dt_met = fread(file = path_met, keepLeadingZeros = TRUE)
class(dt_met)
## [1] "data.table" "data.frame"

convert a data.frame/tibble to data.table

load as non data.table object

# load the same data as a tibble (not a data.table) for the conversion demo
tbl_met = vroom(file = path_met, delim = ",")
## New names:
## * `` -> ...1
## Observations: 107,108
## Variables: 12
## chr  [3]: SCODE, SCODE2, Sensor
## dbl  [8]: , Year, Month, Mean, Max, Min, Stdev, Sum
## date [1]: Date
## 
## Call `spec()` for a copy-pastable column specification
## Specify the column types with `col_types` to quiet this message
# confirm the tibble classes before converting
class(tbl_met)
## [1] "tbl_df"     "tbl"        "data.frame"

convert to data.table (by reference)

# setDT() converts to data.table in place (by reference, no copy is made)
setDT(tbl_met)
class(tbl_met)
## [1] "data.table" "data.frame"

and back to data.frame

# setDF() converts back to a plain data.frame, also by reference
setDF(tbl_met)
class(tbl_met)
## [1] "data.frame"

renaming

# column names via the base R accessor
names(dt_met)
##  [1] "V1"     "Date"   "Year"   "Month"  "SCODE"  "SCODE2" "Sensor"
##  [8] "Mean"   "Max"    "Min"    "Stdev"  "Sum"
# same result inside the data.table frame: .SD stands for the (sub)table itself
dt_met[, names(.SD)] # this syntax will be explained later
##  [1] "V1"     "Date"   "Year"   "Month"  "SCODE"  "SCODE2" "Sensor"
##  [8] "Mean"   "Max"    "Min"    "Stdev"  "Sum"
# rename column V1 to id by reference (no copy)
setnames(x = dt_met, old = "V1", new = "id")
dt_met[, names(.SD)]
##  [1] "id"     "Date"   "Year"   "Month"  "SCODE"  "SCODE2" "Sensor"
##  [8] "Mean"   "Max"    "Min"    "Stdev"  "Sum"

get number of rows

# .N inside [] is the row count; nrow() gives the same number
dt_met[, .N]
## [1] 107108
nrow(dt_met)
## [1] 107108

filtering rows

by indexing

# first five rows by positional index
dt_met[1:5, ]
##    id       Date Year Month   SCODE SCODE2 Sensor        Mean Max Min
## 1:  1 2016-01-01 2016     1 06090SF   0609      N 0.001041667 0.1   0
## 2:  2 2016-01-01 2016     1 61690SF   6169      N 0.000000000 0.0   0
## 3:  3 2016-01-02 2016     1 06090SF   0609      N 0.003139013 0.1   0
## 4:  4 2016-01-02 2016     1 61690SF   6169      N 0.000000000 0.0   0
## 5:  5 2016-01-03 2016     1 06090SF   0609      N 0.003169014 0.1   0
##         Stdev Sum
## 1: 0.01017058 0.3
## 2: 0.00000000 0.0
## 3: 0.01747620 0.7
## 4: 0.00000000 0.0
## 5: 0.01754831 0.9

by a value

# filter rows by a column value; columns are referenced by bare name inside []
dt_met[Year == 2016]
##           id       Date Year Month   SCODE SCODE2 Sensor         Mean
##     1:     1 2016-01-01 2016     1 06090SF   0609      N  0.001041667
##     2:     2 2016-01-01 2016     1 61690SF   6169      N  0.000000000
##     3:     3 2016-01-02 2016     1 06090SF   0609      N  0.003139013
##     4:     4 2016-01-02 2016     1 61690SF   6169      N  0.000000000
##     5:     5 2016-01-03 2016     1 06090SF   0609      N  0.003169014
##    ---                                                               
## 11379: 11379 2016-12-31 2016    12 90000SF   9000     HS 22.981250000
## 11380: 11380 2016-12-31 2016    12 90000SF   9000     LT  1.577083333
## 11381: 11381 2016-12-31 2016    12 91180SF   9118     HS 11.416250000
## 11382: 11382 2016-12-31 2016    12 91180SF   9118     LT  1.231513889
## 11383: 11383 2016-12-31 2016    12 91210WS   9121     LT -3.564190882
##              Max       Min      Stdev       Sum
##     1:  0.100000  0.000000 0.01017058    0.3000
##     2:  0.000000  0.000000 0.00000000    0.0000
##     3:  0.100000  0.000000 0.01747620    0.7000
##     4:  0.000000  0.000000 0.00000000    0.0000
##     5:  0.100000  0.000000 0.01754831    0.9000
##    ---                                         
## 11379: 24.300000 19.300000 1.47643346 3309.3000
## 11380:  5.200000 -0.400000 1.64927273  227.1000
## 11381: 12.120000 10.840000 0.17823484 1643.9400
## 11382:  3.590000 -1.024000 1.22983739  177.3380
## 11383: -0.776766 -5.059454 1.05334960 -513.2435

by multiple values

# filter against a set of values with %in%
dt_met[Year %in% c(2016, 2017)]
##           id       Date Year Month   SCODE SCODE2 Sensor          Mean
##     1:     1 2016-01-01 2016     1 06090SF   0609      N   0.001041667
##     2:     2 2016-01-01 2016     1 61690SF   6169      N   0.000000000
##     3:     3 2016-01-02 2016     1 06090SF   0609      N   0.003139013
##     4:     4 2016-01-02 2016     1 61690SF   6169      N   0.000000000
##     5:     5 2016-01-03 2016     1 06090SF   0609      N   0.003169014
##    ---                                                                
## 51355: 51355 2017-12-31 2017    12 90000SF   9000     HS  56.617361111
## 51356: 51356 2017-12-31 2017    12 90000SF   9000     LT   2.735416667
## 51357: 51357 2017-12-31 2017    12 91180SF   9118     HS 113.933333333
## 51358: 51358 2017-12-31 2017    12 91180SF   9118     LT   0.898055556
## 51359: 51359 2017-12-31 2017    12 91210WS   9121     LT  -2.869423611
##            Max     Min      Stdev       Sum
##     1:   0.100   0.000 0.01017058     0.300
##     2:   0.000   0.000 0.00000000     0.000
##     3:   0.100   0.000 0.01747620     0.700
##     4:   0.000   0.000 0.00000000     0.000
##     5:   0.100   0.000 0.01754831     0.900
##    ---                                     
## 51355:  94.100  48.800 7.17676252  8152.900
## 51356:   8.600  -4.400 3.49630112   393.900
## 51357: 116.000 112.200 1.04620526 16406.400
## 51358:   4.895  -3.934 2.06829005   129.320
## 51359:   1.071  -5.631 2.13702480  -413.197

selecting columns

selecting columns directly

# select columns by bare name; .() is data.table shorthand for list()
dt_met[, .(Date, Mean)]
##               Date         Mean
##      1: 2016-01-01  0.001041667
##      2: 2016-01-01  0.000000000
##      3: 2016-01-02  0.003139013
##      4: 2016-01-02  0.000000000
##      5: 2016-01-03  0.003169014
##     ---                        
## 107104: 2019-01-01 20.900000000
## 107105: 2019-01-01  1.500000000
## 107106: 2019-01-01 30.230000000
## 107107: 2019-01-01 -1.017000000
## 107108: 2019-01-01 -7.074000000

and with a vector of column names

# with = FALSE lets j be an ordinary character vector of column names
vars = c("Date", "Year", "Mean")
dt_met[, vars, with = FALSE]
##               Date Year         Mean
##      1: 2016-01-01 2016  0.001041667
##      2: 2016-01-01 2016  0.000000000
##      3: 2016-01-02 2016  0.003139013
##      4: 2016-01-02 2016  0.000000000
##      5: 2016-01-03 2016  0.003169014
##     ---                             
## 107104: 2019-01-01 2019 20.900000000
## 107105: 2019-01-01 2019  1.500000000
## 107106: 2019-01-01 2019 30.230000000
## 107107: 2019-01-01 2019 -1.017000000
## 107108: 2019-01-01 2019 -7.074000000

manipulation on columns

sum of column Sum on whole dataset (silly because sensors are mixed here)

# evaluate an expression over a column in j: total of the Sum column
dt_met[, sum(Sum)]
## [1] 137412476

in base r this equals

# the base R equivalent of the j expression above
sum(dt_met$Sum)
## [1] 137412476

sum for all years for precipitation

# combine i (row filter) and j (aggregate): total precipitation across all years
dt_met[Sensor == "N", sum(Sum)]
## [1] 48766.4

sum for 2016 for precipitation

# multiple filter conditions combine with the elementwise &amp; operator
dt_met[Year == 2016 &amp; Sensor == "N", sum(Sum)]
## [1] 12.3

get a summary for all rows on column Sum

# j can be any expression, e.g. a five-number summary of a column
dt_met[, summary(Sum)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -5702.4     0.0   299.6  1282.9  1585.2 39512.7

get a summary only for the year 2016 and the precipitation

# the same summary restricted to 2016 precipitation rows
dt_met[Year == 2016 &amp; Sensor == "N", summary(Sum)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3417  0.2250  2.5000

add a new column

# := adds/updates a column by reference; .N supplies the row count for rnorm
dt_met[, new_col := rnorm(n = .N)]

set some values to NA

# blank out new_col in 15 randomly chosen rows (sampling without replacement)
dt_met[sample(x = 1:.N, size = 15, replace = FALSE), new_col := NA]

get count of NA

# two equivalent ways to count the NA rows just created
dt_met[is.na(new_col), .N]
## [1] 15
nrow(dt_met[is.na(new_col)])
## [1] 15

set NAs to numbers

# sub-assign by reference: only the NA rows are updated
dt_met[is.na(new_col), new_col := 0.1]

sum up two columns row by row

# Row-wise sum of Mean and Stdev into a new column, by reference.
# rowSums() over .SDcols is vectorised; the original trick of grouping with
# by = 1:dt_met[, .N] (one group per row) evaluates j once per row and is
# dramatically slower while producing the same values. NA entries count as
# zero in both variants (na.rm = TRUE), so results are identical.
dt_met[, mean_sd := rowSums(.SD, na.rm = TRUE), .SDcols = c("Mean", "Stdev")]

grouping (doing j by group)

counts per group by one grouping variable

# row counts per group: .N is evaluated once for each Sensor level
dt_met[, .N, by = Sensor]
##    Sensor     N
## 1:      N 20570
## 2:     LT 68942
## 3:     HS 17596

counts per group by more grouping variable

# grouping by several variables: wrap them in .()
dt_met[, .N, by = .(Sensor, Year)]
##     Sensor Year     N
##  1:      N 2016    36
##  2:     LT 2016  9012
##  3:     HS 2016  2335
##  4:     HS 2017  7607
##  5:     LT 2017 29772
##  6:      N 2017  2597
##  7:     HS 2018  7633
##  8:     LT 2018 30076
##  9:      N 2018 17888
## 10:     HS 2019    21
## 11:     LT 2019    82
## 12:      N 2019    49

Sum of the column Sum by Sensor

# named aggregate per group; the name on the left becomes the output column
dt_met[, .(mean_Sum = mean(Sum)), by = Sensor]
##    Sensor    mean_Sum
## 1:      N    2.370754
## 2:     LT  662.376684
## 3:     HS 5211.305737

Mean and Sum of the column Sum by Sensor

# several aggregates at once, still one row per Sensor group
dt_met[, .(mean_Sum = mean(Sum), 
           sum_Sum = sum(Sum)), by = Sensor]
##    Sensor    mean_Sum    sum_Sum
## 1:      N    2.370754    48766.4
## 2:     LT  662.376684 45665573.4
## 3:     HS 5211.305737 91698135.7

Mean and Sum of the column Sum by Year and Sensor

# the same aggregates, now grouped by the Year/Sensor combination
dt_met[, .(mean_Sum = mean(Sum), 
           sum_Sum = sum(Sum)), by = .(Year, Sensor)]
##     Year Sensor     mean_Sum      sum_Sum
##  1: 2016      N    0.3416667       12.300
##  2: 2016     LT  310.7576570  2800548.005
##  3: 2016     HS 2014.3930851  4703607.854
##  4: 2017     HS 3737.6802240 28432533.464
##  5: 2017     LT  673.3647083 20047414.096
##  6: 2017      N    1.7247593     4479.200
##  7: 2018     HS 7672.0457713 58560725.372
##  8: 2018     LT  758.6646350 22817597.563
##  9: 2018      N    2.4751174    44274.900
## 10: 2019     HS   60.4309524     1269.050
## 11: 2019     LT    0.1672317       13.713
## 12: 2019      N    0.0000000        0.000

compare to dplyr syntax

# the dplyr spelling of the grouped mean above (note: returns a tibble)
dt_met %&gt;% group_by(Sensor) %&gt;% summarise(mean_Sum = mean(Sum))
## # A tibble: 3 x 2
##   Sensor mean_Sum
##   <chr>     <dbl>
## 1 HS      5211.  
## 2 LT       662.  
## 3 N          2.37

.SD and .SDcols

.SD are all columns

# .SD with no .SDcols refers to every column of the table
dt_met[, .SD]
##             id       Date Year Month   SCODE SCODE2 Sensor         Mean
##      1:      1 2016-01-01 2016     1 06090SF   0609      N  0.001041667
##      2:      2 2016-01-01 2016     1 61690SF   6169      N  0.000000000
##      3:      3 2016-01-02 2016     1 06090SF   0609      N  0.003139013
##      4:      4 2016-01-02 2016     1 61690SF   6169      N  0.000000000
##      5:      5 2016-01-03 2016     1 06090SF   0609      N  0.003169014
##     ---                                                                
## 107104: 107104 2019-01-01 2019     1 90000SF   9000     HS 20.900000000
## 107105: 107105 2019-01-01 2019     1 90000SF   9000     LT  1.500000000
## 107106: 107106 2019-01-01 2019     1 91180SF   9118     HS 30.230000000
## 107107: 107107 2019-01-01 2019     1 91180SF   9118     LT -1.017000000
## 107108: 107108 2019-01-01 2019     1 91210WS   9121     LT -7.074000000
##            Max    Min      Stdev    Sum     new_col     mean_sd
##      1:  0.100  0.000 0.01017058  0.300  0.02892497  0.01121225
##      2:  0.000  0.000 0.00000000  0.000  1.67261024  0.00000000
##      3:  0.100  0.000 0.01747620  0.700  0.62964998  0.02061521
##      4:  0.000  0.000 0.00000000  0.000 -0.81541656  0.00000000
##      5:  0.100  0.000 0.01754831  0.900 -0.88383279  0.02071732
##     ---                                                        
## 107104: 20.900 20.900         NA 20.900  0.03509723 20.90000000
## 107105:  1.500  1.500         NA  1.500 -1.26976160  1.50000000
## 107106: 30.230 30.230         NA 30.230  0.02003485 30.23000000
## 107107: -1.017 -1.017         NA -1.017 -0.13307079 -1.01700000
## 107108: -7.074 -7.074         NA -7.074  0.62383133 -7.07400000
# proof: selecting .SD reproduces the full table exactly
identical(dt_met, dt_met[, .SD])
## [1] TRUE

with .SDcols are selected columns

# .SDcols restricts .SD to the listed columns
dt_met[, .SD, .SDcols = c("Year", "Sum")]
##         Year    Sum
##      1: 2016  0.300
##      2: 2016  0.000
##      3: 2016  0.700
##      4: 2016  0.000
##      5: 2016  0.900
##     ---            
## 107104: 2019 20.900
## 107105: 2019  1.500
## 107106: 2019 30.230
## 107107: 2019 -1.017
## 107108: 2019 -7.074

manipulating on more columns

This is very powerful!

lapply on .SD - apply function to all columns

lapply on .SD would use all columns, but sum() fails on the character columns, so the call below is commented out.

# dt_met[, lapply(.SD, sum, na.rm = TRUE)]

lapply on .SDcols - apply function to selected columns

apply sum to selected columns (results in one value per column)

# apply sum() to each selected column; one output value per column
dt_met[, lapply(.SD, sum, na.rm = TRUE), .SDcols = c("Sum", "Mean")]
##          Sum     Mean
## 1: 137412476 959242.7

apply sum to selected columns and grouped by year and sensor

# the same per-column sum, evaluated once per Year/Sensor group
dt_met[, lapply(.SD, sum, na.rm = TRUE), .SDcols = c("Sum", "Mean"), by = .(Year, Sensor)]
##     Year Sensor          Sum         Mean
##  1: 2016      N       12.300 8.035328e-02
##  2: 2016     LT  2800548.005 1.996593e+04
##  3: 2016     HS  4703607.854 3.280765e+04
##  4: 2017     HS 28432533.464 1.984382e+05
##  5: 2017     LT 20047414.096 1.395490e+05
##  6: 2017      N     4479.200 1.563619e+01
##  7: 2018     HS 58560725.372 4.082852e+05
##  8: 2018     LT 22817597.563 1.587444e+05
##  9: 2018      N    44274.900 1.537463e+02
## 10: 2019     HS     1269.050 1.269050e+03
## 11: 2019     LT       13.713 1.371300e+01
## 12: 2019      N        0.000 0.000000e+00

apply custom function to selected columns grouped by year and sensor

# any function works, including anonymous ones: here the squared column sum per group
dt_met[, lapply(.SD, function(x){sum(x)^2}), .SDcols = c("Sum", "Mean"), by = .(Year, Sensor)]
##     Year Sensor          Sum         Mean
##  1: 2016      N 1.512900e+02 6.456650e-03
##  2: 2016     LT 7.843069e+12 3.986385e+08
##  3: 2016     HS 2.212393e+13 1.076342e+09
##  4: 2017     HS 8.084090e+14 3.937773e+10
##  5: 2017     LT 4.018988e+14 1.947394e+10
##  6: 2017      N 2.006323e+07 2.444905e+02
##  7: 2018     HS 3.429359e+15 1.666968e+11
##  8: 2018     LT 5.206428e+14 2.519979e+10
##  9: 2018      N 1.960267e+09 2.363792e+04
## 10: 2019     HS 1.610488e+06 1.610488e+06
## 11: 2019     LT 1.880464e+02 1.880464e+02
## 12: 2019      N 0.000000e+00 0.000000e+00

joins

Some more info on data.table joins is available online. First we create a data.table to join to dt_met.

# lookup table with one random coefficient per Year x Sensor combination;
# the length-3 Sensor vector is recycled against the 12 Year entries
dt_coefs = data.table(Year = c(rep(2016, 3), rep(2017, 3), rep(2018, 3), rep(2019, 3)), Sensor = c("N", "LT", "HS"), coeff = rnorm(12))
dt_coefs
##     Year Sensor         coeff
##  1: 2016      N -4.447221e-01
##  2: 2016     LT -1.024672e+00
##  3: 2016     HS -1.043777e-01
##  4: 2017      N  1.135168e+00
##  5: 2017     LT -1.182038e+00
##  6: 2017     HS  6.704746e-01
##  7: 2018      N -2.736748e-01
##  8: 2018     LT  1.168416e+00
##  9: 2018     HS -1.369354e+00
## 10: 2019      N  2.055106e-01
## 11: 2019     LT -4.257064e-05
## 12: 2019     HS  1.040320e+00

join using the merge syntax (as in base r)

# inner join on the two key columns using the familiar base-R merge() interface
merge(dt_met, dt_coefs, by =c("Year", "Sensor"))
##         Year Sensor     id       Date Month   SCODE SCODE2       Mean  Max
##      1: 2016     HS    198 2016-07-15     7 53200SF   5320  1.8213592 35.1
##      2: 2016     HS    201 2016-07-16     7 53200SF   5320  1.0430556  2.0
##      3: 2016     HS    204 2016-07-17     7 53200SF   5320  1.0673611  2.8
##      4: 2016     HS    208 2016-07-18     7 53200SF   5320  0.4423611  2.1
##      5: 2016     HS    212 2016-07-19     7 53200SF   5320 -0.6868056 17.6
##     ---                                                                   
## 107104: 2019      N 107095 2019-01-01     1 85700MS   8570  0.0000000  0.0
## 107105: 2019      N 107097 2019-01-01     1 86600MS   8660  0.0000000  0.0
## 107106: 2019      N 107099 2019-01-01     1 86900MS   8690  0.0000000  0.0
## 107107: 2019      N 107101 2019-01-01     1 88820MS   8882  0.0000000  0.0
## 107108: 2019      N 107103 2019-01-01     1 89190MS   8919  0.0000000  0.0
##          Min     Stdev   Sum    new_col  mean_sd      coeff
##      1: -2.1 5.0128003 187.6  0.9428756 6.834160 -0.1043777
##      2: -0.8 0.5720351 150.2  0.1622654 1.615091 -0.1043777
##      3: -0.8 0.7099522 153.7 -0.2910156 1.777313 -0.1043777
##      4: -2.7 1.1633434  63.7  1.1730190 1.605704 -0.1043777
##      5: -4.2 2.5784000 -98.9  1.1991937 1.891594 -0.1043777
##     ---                                                    
## 107104:  0.0        NA   0.0 -0.5023155 0.000000  0.2055106
## 107105:  0.0        NA   0.0 -0.1824975 0.000000  0.2055106
## 107106:  0.0        NA   0.0 -1.5950813 0.000000  0.2055106
## 107107:  0.0        NA   0.0 -2.6628398 0.000000  0.2055106
## 107108:  0.0        NA   0.0 -0.6258653 0.000000  0.2055106

join using the data.table syntax

# data.table join syntax: key both tables on the join columns, then X[Y]
# looks up the rows of Y in X (here: dt_coefs rows joined against dt_met)
setkey(dt_met, Year, Sensor)
key(dt_met)
## [1] "Year"   "Sensor"
setkey(dt_coefs, Year, Sensor)
key(dt_coefs)
## [1] "Year"   "Sensor"
dt_met[dt_coefs]
##             id       Date Year Month   SCODE SCODE2 Sensor       Mean  Max
##      1:    198 2016-07-15 2016     7 53200SF   5320     HS  1.8213592 35.1
##      2:    201 2016-07-16 2016     7 53200SF   5320     HS  1.0430556  2.0
##      3:    204 2016-07-17 2016     7 53200SF   5320     HS  1.0673611  2.8
##      4:    208 2016-07-18 2016     7 53200SF   5320     HS  0.4423611  2.1
##      5:    212 2016-07-19 2016     7 53200SF   5320     HS -0.6868056 17.6
##     ---                                                                   
## 107104: 107095 2019-01-01 2019     1 85700MS   8570      N  0.0000000  0.0
## 107105: 107097 2019-01-01 2019     1 86600MS   8660      N  0.0000000  0.0
## 107106: 107099 2019-01-01 2019     1 86900MS   8690      N  0.0000000  0.0
## 107107: 107101 2019-01-01 2019     1 88820MS   8882      N  0.0000000  0.0
## 107108: 107103 2019-01-01 2019     1 89190MS   8919      N  0.0000000  0.0
##          Min     Stdev   Sum    new_col  mean_sd      coeff
##      1: -2.1 5.0128003 187.6  0.9428756 6.834160 -0.1043777
##      2: -0.8 0.5720351 150.2  0.1622654 1.615091 -0.1043777
##      3: -0.8 0.7099522 153.7 -0.2910156 1.777313 -0.1043777
##      4: -2.7 1.1633434  63.7  1.1730190 1.605704 -0.1043777
##      5: -4.2 2.5784000 -98.9  1.1991937 1.891594 -0.1043777
##     ---                                                    
## 107104:  0.0        NA   0.0 -0.5023155 0.000000  0.2055106
## 107105:  0.0        NA   0.0 -0.1824975 0.000000  0.2055106
## 107106:  0.0        NA   0.0 -1.5950813 0.000000  0.2055106
## 107107:  0.0        NA   0.0 -2.6628398 0.000000  0.2055106
## 107108:  0.0        NA   0.0 -0.6258653 0.000000  0.2055106

Chaining

Outlook

  • transform a raster to a data.table
  • some benchmarking on larger datasets: here