Introduction to csdb

library(data.table)
library(magrittr)

Overview

csdb provides an abstracted database-access layer for the Core Surveillance (csverse) ecosystem. The package exposes two R6 classes:

  • DBConnection_v9 — wraps a single ODBC database connection and manages its lifecycle (connect, disconnect, reconnect).
  • DBTable_v9 — represents one database table and provides methods for inserting, upserting, and deleting rows, as well as managing indexes and validating field types and contents.

Connection credentials are read from environment variables, keeping them out of scripts and version control.

DBConnection_v9

DBConnection_v9$new() stores connection parameters but does not open a connection immediately. Calling $connect() opens the connection; $disconnect() closes it. The $autoconnection field returns an active connection, reconnecting automatically if needed.

The example below creates a connection object, connects, inspects the connection fields, then disconnects. After disconnecting, $connection returns an invalid external pointer, while class() still reports the driver type.

DBTable_v9

DBTable_v9$new() takes a dbconfig list (the same parameters as DBConnection_v9), plus table_name, field_types, keys, indexes, and validators. Keys define the primary key used for upsert operations. Validators can enforce field-type and field-content constraints; validator_field_types_blank and validator_field_contents_blank skip validation entirely.

The example below creates a table object backed by a live PostgreSQL database, clears any existing rows, inserts the bundled nor_covid19_cases_by_time_location dataset, connects, and then returns a lazy tbl() reference via dbplyr.

dbconnection <- csdb::DBConnection_v9$new(
  driver = Sys.getenv("CS9_DBCONFIG_DRIVER"),
  server = Sys.getenv("CS9_DBCONFIG_SERVER"),
  port = as.integer(Sys.getenv("CS9_DBCONFIG_PORT")),
  db = Sys.getenv("CS9_DBCONFIG_DB_ANON"),
  user = Sys.getenv("CS9_DBCONFIG_USER"),
  password = Sys.getenv("CS9_DBCONFIG_PASSWORD"),
  sslmode = Sys.getenv("CS9_DBCONFIG_SSLMODE")
)
dbconnection
#> (disconnected)
#> 
#> Driver:              PostgreSQL Unicode 
#> Server:              db 
#> Port:                5432 
#> DB:                  postgres 
#> User:                yourusername 
#> Password:            ********************* 
#> Trusted connection:  x
dbconnection$connect()

dbconnection$connection
#> <OdbcConnection> yourusername@db
#>   Database: postgres
#>   PostgreSQL Version: 17.0.5
dbconnection$autoconnection
#> <OdbcConnection> yourusername@db
#>   Database: postgres
#>   PostgreSQL Version: 17.0.5
dbconnection
#> (connected)
#> 
#> Driver:              PostgreSQL Unicode 
#> Server:              db 
#> Port:                5432 
#> DB:                  postgres 
#> User:                yourusername 
#> Password:            ********************* 
#> Trusted connection:  x

dbconnection$disconnect()
dbconnection$connection
#> Error: external pointer is not valid
class(dbconnection$connection)
#> [1] "PostgreSQL"
#> attr(,"package")
#> [1] "odbc"
class(dbconnection$autoconnection)
#> [1] "PostgreSQL"
#> attr(,"package")
#> [1] "odbc"

dbtable <- csdb::DBTable_v9$new(
  dbconfig = list(
    driver = Sys.getenv("CS9_DBCONFIG_DRIVER"),
    server = Sys.getenv("CS9_DBCONFIG_SERVER"),
    port = as.integer(Sys.getenv("CS9_DBCONFIG_PORT")),
    db = Sys.getenv("CS9_DBCONFIG_DB_ANON"),
    schema = Sys.getenv("CS9_DBCONFIG_SCHEMA_ANON"),
    user = Sys.getenv("CS9_DBCONFIG_USER"),
    password = Sys.getenv("CS9_DBCONFIG_PASSWORD"),
    sslmode = Sys.getenv("CS9_DBCONFIG_SSLMODE")
  ),
  table_name = "anon_test",
  field_types = c(
      "granularity_time" = "TEXT",
      "granularity_geo" = "TEXT",
      "country_iso3" = "TEXT",
      "location_code" = "TEXT",
      "border" = "INTEGER",
      "age" = "TEXT",
      "sex" = "TEXT",
      "isoyear" = "INTEGER",
      "isoweek" = "INTEGER",
      "isoyearweek" = "TEXT",
      "season" = "TEXT",
      "seasonweek" = "DOUBLE",
      "calyear" = "INTEGER",
      "calmonth" = "INTEGER",
      "calyearmonth" = "TEXT",
      "date" = "DATE",
      "covid19_cases_testdate_n" = "INTEGER",
      "covid19_cases_testdate_pr100000" = "DOUBLE"
    ),
    keys = c(
      "granularity_time",
      "location_code",
      "date",
      "age",
      "sex"
    ),
    indexes = list(
      "ind1" = c("granularity_time", "granularity_geo", "country_iso3", "location_code", "border", "age", "sex", "date", "isoyear", "isoweek", "isoyearweek")
    ),
    validator_field_types = csdb::validator_field_types_blank,
    validator_field_contents = csdb::validator_field_contents_blank
)
dbtable$drop_all_rows()
dbtable$insert_data(csdb::nor_covid19_cases_by_time_location)
dbtable$connect()
dbtable$dbconnection$is_connected()
#> [1] TRUE
dbtable$tbl()
#> # Source:   table<"public"."anon_test"> [?? x 18]
#> # Database: postgres  [yourusername@localhost:/postgres]
#>    granularity_time granularity_geo country_iso3 location_code border age   sex   isoyear isoweek isoyearweek season   seasonweek
#>    <chr>            <chr>           <chr>        <chr>          <int> <chr> <chr>   <int>   <int> <chr>       <chr>         <dbl>
#>  1 day              county          nor          county_nor03    2020 total total    2020       8 2020-08     2019/20…         31
#>  2 day              county          nor          county_nor03    2020 total total    2020       8 2020-08     2019/20…         31
#>  3 day              county          nor          county_nor03    2020 total total    2020       8 2020-08     2019/20…         31
#>  4 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#>  5 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#>  6 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#>  7 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#>  8 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#>  9 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#> 10 day              county          nor          county_nor03    2020 total total    2020       9 2020-09     2019/20…         32
#> # ℹ more rows
#> # ℹ 6 more variables: calyear <int>, calmonth <int>, calyearmonth <chr>, date <date>, covid19_cases_testdate_n <int>,
#> #   covid19_cases_testdate_pr100000 <dbl>