# The subtee subbuild function

#### 2019-01-07

In addition to the modelling functions, the package includes the subbuild function, which may be useful for defining the subgroup covariates to use in the analysis.

## Obtaining the data

We use the prca data that was used in Rosenkranz (2016) https://onlinelibrary.wiley.com/doi/abs/10.1002/bimj.201500147

library(subtee)
################################################################################
# The data comes from a clinical trial of a prostate cancer treatment.
# Data is loaded from Royston, Patrick, and Willi Sauerbrei. Multivariable
# model-building: a pragmatic approach to regression analysis based on
# fractional polynomials for modelling continuous variables. Vol. 777.
# John Wiley & Sons, 2008.
# https://www.imbi.uni-freiburg.de/Royston-Sauerbrei-book
data_url <- "https://www.imbi.uni-freiburg.de/imbi/Royston-Sauerbrei-book/Multivariable_Model-building/downloads/datasets/adv_prostate_ca.zip"

# Download the zip archive to a temporary file; the archive is binary, so
# request binary mode explicitly.
temp <- tempfile(fileext = ".zip")
download.file(data_url, temp, mode = "wb", cacheOK = FALSE)

# Read the CSV straight out of the archive, then delete the temporary file
# so the download does not linger for the rest of the session.
prca <- read.csv(unz(temp, "adv_prostate_ca/adv_prostate_ca.csv"))
unlink(temp)

# Upper-case the column names; the examples refer to AGE, BM, WT, ...
names(prca) <- toupper(names(prca))

## Building subgroups

The subbuild function creates binary subgroup indicator variables. For example, to create the subgroup indicator for the group of subjects older than 65 years, we simply pass this expression to the function:

# One indicator column for the expression AGE > 65.
subgroups <- subbuild(data = prca, AGE > 65)
head(subgroups)
#>   AGE > 65
#> 1        1
#> 2        1
#> 3        1
#> 4        1
#> 5        1
#> 6        1

When a continuous covariate is given with no cutoff, the function creates n.cuts + 1 subgroups with approximately equal sample sizes.

# No cutoff given for AGE: n.cuts = 4 yields five subgroup indicator columns.
subgroups <- subbuild(data = prca, AGE, n.cuts = 4)
head(subgroups)
#>   AGE<=68 68<AGE<=72 72<AGE<=74 74<AGE<=76 AGE>76
#> 1       0          0          0          1      0
#> 2       0          1          0          0      0
#> 3       0          0          0          1      0
#> 4       1          0          0          0      0
#> 5       0          1          0          0      0
#> 6       0          0          0          1      0

The variable BM, which indicates whether the subject had bone metastasis at baseline, already contains only 0s and 1s, but it is still possible to create the indicator using subbuild.

# Build the indicator for BM == 1 through subbuild.
subgroups <- subbuild(data = prca, BM == 1)
head(subgroups)
#>   BM == 1
#> 1       0
#> 2       0
#> 3       0
#> 4       0
#> 5       0
#> 6       0

Doing this may be useful for consistency, as subbuild can take several expressions and so define all the candidate subgroups to be analysed at once.

# Define all candidate subgroups in a single call, one expression per subgroup.
cand.groups <- subbuild(
  prca,
  BM == 1,
  PF == 1,
  HX == 1,
  STAGE == 4,
  AGE > 65,
  WT > 100
)
head(cand.groups)
#>   BM == 1 PF == 1 HX == 1 STAGE == 4 AGE > 65 WT > 100
#> 1       0       0       0          0        1        0
#> 2       0       0       1          0        1        1
#> 3       0       1       1          0        1        0
#> 4       0       0       0          0        1        0
#> 5       0       0       0          0        1        0
#> 6       0       0       0          0        1        0

If no expressions are given, subbuild generates the binary subgroup indicators for all covariates in the data set (here restricted to columns 2 to 7) using the default settings.

# No expressions supplied: indicators are built for every column of prca[, 2:7].
cand.groups <- subbuild(prca[,2:7])
head(cand.groups)
#>   AGE<=71 71<AGE<=75 AGE>75 WT<=93 93<WT<=103 WT>103 SBP<=13 13<SBP<=15
#> 1       0          1      0      1          0      0       0          1
#> 2       1          0      0      0          1      0       0          1
#> 3       0          1      0      0          1      0       0          1
#> 4       1          0      0      0          1      0       0          0
#> 5       1          0      0      0          1      0       0          0
#> 6       0          1      0      0          1      0       0          1
#>   SBP>15 DBP<=8 8<DBP<=9 DBP>9 SZ<=7 7<SZ<=17 SZ>17 AP<=6 6<AP<=15 AP>15
#> 1      0      0        1     0     1        0     0     1        0     0
#> 2      0      1        0     0     1        0     0     1        0     0
#> 3      0      1        0     0     1        0     0     0        1     0
#> 4      1      0        0     1     0        0     1     1        0     0
#> 5      1      0        0     1     0        1     0     1        0     0
#> 6      0      0        0     1     0        1     0     0        1     0

Equivalently, subgroup indicators can be created for explicitly named covariates using the default settings:

# Name the covariates explicitly; the default settings are used for each.
cand.groups <- subbuild(prca, AGE, WT, SBP, DBP, SZ, AP)
head(cand.groups)
#>   AGE<=71 71<AGE<=75 AGE>75 WT<=93 93<WT<=103 WT>103 SBP<=13 13<SBP<=15
#> 1       0          1      0      1          0      0       0          1
#> 2       1          0      0      0          1      0       0          1
#> 3       0          1      0      0          1      0       0          1
#> 4       1          0      0      0          1      0       0          0
#> 5       1          0      0      0          1      0       0          0
#> 6       0          1      0      0          1      0       0          1
#>   SBP>15 DBP<=8 8<DBP<=9 DBP>9 SZ<=7 7<SZ<=17 SZ>17 AP<=6 6<AP<=15 AP>15
#> 1      0      0        1     0     1        0     0     1        0     0
#> 2      0      1        0     0     1        0     0     1        0     0
#> 3      0      1        0     0     1        0     0     0        1     0
#> 4      1      0        0     1     0        0     1     1        0     0
#> 5      1      0        0     1     0        1     0     1        0     0
#> 6      0      0        0     1     0        1     0     0        1     0

The matrix with all the candidate subgroups will still need to be concatenated with the original data.frame (or at least the response and treatment variables) to be used in the fitting functions unadj, modav, and bagged.

# Put the SURVTIME, CENS and RX columns of prca in front of the subgroup
# indicators to form the data set passed to the fitting functions.
fitdat <- cbind(prca[, c("SURVTIME", "CENS", "RX")], cand.groups)
head(fitdat)
#>   SURVTIME CENS RX AGE<=71 71<AGE<=75 AGE>75 WT<=93 93<WT<=103 WT>103
#> 1     72.5    0  0       0          1      0      1          0      0
#> 2     40.5    1  1       1          0      0      0          1      0
#> 3     20.5    1  0       0          1      0      0          1      0
#> 4     65.5    0  0       1          0      0      0          1      0
#> 5     24.5    1  0       1          0      0      0          1      0
#> 6     46.5    1  0       0          1      0      0          1      0
#>   SBP<=13 13<SBP<=15 SBP>15 DBP<=8 8<DBP<=9 DBP>9 SZ<=7 7<SZ<=17 SZ>17
#> 1       0          1      0      0        1     0     1        0     0
#> 2       0          1      0      1        0     0     1        0     0
#> 3       0          1      0      1        0     0     1        0     0
#> 4       0          0      1      0        0     1     0        0     1
#> 5       0          0      1      0        0     1     0        1     0
#> 6       0          1      0      0        0     1     0        1     0
#>   AP<=6 6<AP<=15 AP>15
#> 1     1        0     0
#> 2     1        0     0
#> 3     0        1     0
#> 4     1        0     0
#> 5     1        0     0
#> 6     0        1     0

Note that the names of the subgroup-defining variables are not syntactically valid R names. This can be changed by setting the option make.valid.names = TRUE.

# Same candidate subgroups, with make.valid.names = TRUE for the column names.
cand.groups <- subbuild(
  prca,
  BM == 1,
  PF == 1,
  HX == 1,
  STAGE == 4,
  AGE > 65,
  WT > 100,
  make.valid.names = TRUE
)
head(cand.groups)
#>   BM.1 PF.1 HX.1 STAGE.4 AGE.g.65 WT.g.100
#> 1    0    0    0       0        1        0
#> 2    0    0    1       0        1        1
#> 3    0    1    1       0        1        0
#> 4    0    0    0       0        1        0
#> 5    0    0    0       0        1        0
#> 6    0    0    0       0        1        0

However, the fitting functions in the package allow expressions to be used as variable names, which leads to more informative plots and summary tables.