%\VignetteEngine{knitr::knitr}
%\VignetteIndexEntry{tramnet}
%\VignetteDepends{tramnet, lattice, colorspace, penalized, glmnet, mvtnorm, Matrix, kableExtra, coin, trtf, tbm, tram}

\documentclass[a4paper]{report}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{RJournal}
\usepackage{amsfonts,amstext,amsmath,amssymb,amsthm}
\usepackage{array}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}
\newcommand{\xmark}{\ding{55}}


\usepackage{accents}

%%% mlt
%% rv
\newcommand{\rZ}{Z}
\newcommand{\rY}{Y}
\newcommand{\rX}{\mX}
\newcommand{\rS}{\mS}
\newcommand{\rs}{\svec}
\newcommand{\rz}{z}
\newcommand{\ry}{y}
\newcommand{\rx}{\xvec}
\newcommand{\erx}{x}
%% sigma algebra
\newcommand{\sA}{\mathfrak{A}}
\newcommand{\sAZ}{\mathfrak{B}}
\newcommand{\sAY}{\mathfrak{C}}
\newcommand{\esA}{A}
\newcommand{\esAZ}{B}
\newcommand{\esAY}{C}
%% sample spaces
\newcommand{\sam}{\Omega}
\newcommand{\samZ}{\RR}
\newcommand{\samY}{\Xi}
\newcommand{\samX}{\chi}
%% measureable spaces
\newcommand{\ms}{(\sam, \sA)}
\newcommand{\msZ}{(\samZ, \sAZ)}
\newcommand{\msY}{(\samY, \sAY)}
%% probability spaces
\newcommand{\ps}{(\sam, \sA, \Prob)}
\newcommand{\psZ}{(\samZ, \sAZ, \Prob_\rZ)}
\newcommand{\psY}{(\samY, \sAY, \Prob_\rY)}
%% distributions
\newcommand{\pZ}{F_\rZ}
\newcommand{\pY}{F_\rY}
\newcommand{\hatpY}{\hat{F}_{\rY,N}}
\newcommand{\hatpYx}{\hat{F}_{\rY | \rX = \rx, N}}
\newcommand{\pN}{\Phi}
\newcommand{\pSL}{F_{\SL}}
\newcommand{\pMEV}{F_{\MEV}}
\newcommand{\pGumbel}{F_{\text{Gumbel}}}
\newcommand{\pSW}{F_{\SW}}
\newcommand{\pYx}{F_{\rY | \rX = \rx}}
\newcommand{\pYA}{F_{\rY | \rX = A}}
\newcommand{\pYB}{F_{\rY | \rX = B}}
\newcommand{\qZ}{F^{-1}_\rZ}
\newcommand{\qY}{F^{-1}_\rY}
\newcommand{\dZ}{f_\rZ}
\newcommand{\dY}{f_\rY}
\newcommand{\hatdY}{\hat{f}_{\rY, N}}
\newcommand{\dYx}{f_{\rY | \rX = \rx}}
\newcommand{\hazY}{\lambda_{\rY}}
\newcommand{\HazY}{\Lambda_{\rY}}
\newcommand{\hathazY}{\hat{\lambda}_{\rY, N}}
\newcommand{\hatHazY}{\hat{\Lambda}_{\rY, N}}
%% measures
\newcommand{\measureY}{\mu}
\newcommand{\lebesgue}{\mu_L}
\newcommand{\counting}{\mu_C}
%% trafo
\newcommand{\g}{g}
\newcommand{\h}{h}
\newcommand{\s}{\svec}
\newcommand{\hY}{h_\rY}
\newcommand{\hx}{h_\rx}
\newcommand{\hs}{\mathcal{H}}
\newcommand{\basisy}{\avec}
\newcommand{\bern}[1]{\avec_{\text{Bs},#1}}
\newcommand{\bernx}[1]{\bvec_{\text{Bs},#1}}
\newcommand{\basisx}{\bvec}
\newcommand{\basisyx}{\cvec}
\newcommand{\m}{m}
\newcommand{\lik}{\mathcal{L}}
\newcommand{\parm}{\varthetavec}
\newcommand{\eparm}{\vartheta}
\newcommand{\dimparm}{P}
\newcommand{\dimparmx}{Q}
\newcommand{\shiftparm}{\betavec}
\newcommand{\baseparm}{\alphavec}
\newcommand{\eshiftparm}{\beta}

\newcommand{\ie}{\textit{i.e.}~}
\newcommand{\eg}{\textit{e.g.}~}

%\newcommand{\Prob}{\mathbb{P}}
\newcommand{\Ex}{\mathbb{E}}
\newcommand{\RR}{\mathbb{R}}
\newcommand{\eps}{\varepsilon}
\newcommand{\prodname}{tensor }
\newcommand{\Null}{\mathbf{0}}
\newcommand{\FI}{\mF}

\usepackage{dsfont}
\newcommand{\I}{\mathds{1}}


\def \dsP {\text{$\mathds{P}$}}
\def \dsE {\text{$\mathds{E}$}}
\def \dsR {\text{$\mathds{R}$}}
\def \dsN {\text{$\mathds{N}$}}


% Math Operators

 \DeclareMathOperator{\logit}{logit}
 \DeclareMathOperator{\expit}{expit}
 \DeclareMathOperator{\LRT}{LRT}
 \DeclareMathOperator{\RLRT}{RLRT}
 \DeclareMathOperator{\Cov}{Cov}
 \DeclareMathOperator{\Cor}{Cor}
 \DeclareMathOperator{\Var}{Var}
 \DeclareMathOperator{\EW}{\dsE}
 \DeclareMathOperator{\D}{D}
 \DeclareMathOperator{\Bias}{Bias}
 \DeclareMathOperator{\MSE}{MSE}
 \DeclareMathOperator{\PLS}{PLS}
 \DeclareMathOperator{\rank}{rank}
 \DeclareMathOperator{\ncol}{ncol}
 \DeclareMathOperator{\pen}{pen}
 \DeclareMathOperator{\const}{const}
 \DeclareMathOperator{\diag}{diag}
 \DeclareMathOperator{\blockdiag}{blockdiag}
 \DeclareMathOperator{\df}{df}
 \DeclareMathOperator{\trace}{tr}
 \DeclareMathOperator{\iid}{i.i.d.}
 \DeclareMathOperator{\ind}{ind.}
 \DeclareMathOperator{\obs}{obs}
 \DeclareMathOperator{\acos}{acos}
 \DeclareMathOperator{\spat}{spat}
 \DeclareMathOperator{\fix}{{fix}}
 \DeclareMathOperator{\ran}{{ran}}
 \DeclareMathOperator*{\argmin}{{arg\,min}}
 \DeclareMathOperator*{\argmax}{{arg\,max}}
 \DeclareMathOperator{\BIC}{{BIC}}
 \DeclareMathOperator{\DIC}{{DIC}}
 \DeclareMathOperator{\AIC}{{AIC}}
 \DeclareMathOperator{\mAIC}{{mAIC}}
 \DeclareMathOperator{\cAIC}{{cAIC}}

% Distributions

 \DeclareMathOperator{\ND}{N}
 \DeclareMathOperator{\TND}{TN}
 \DeclareMathOperator{\UD}{U}
 \DeclareMathOperator{\GaD}{Ga}
 \DeclareMathOperator{\tD}{t}
 \DeclareMathOperator{\IGD}{IG}
 \DeclareMathOperator{\IWD}{IW}
 \DeclareMathOperator{\PoD}{Po}
 \DeclareMathOperator{\ExpD}{Exp}
 \DeclareMathOperator{\LapD}{Lap}
 \DeclareMathOperator{\MuD}{Mu}
 \DeclareMathOperator{\DirD}{Dir}
 \DeclareMathOperator{\PDD}{PD}
 \DeclareMathOperator{\BeD}{Be}
 \DeclareMathOperator{\BD}{B}
 \DeclareMathOperator{\DPD}{DP}
 \DeclareMathOperator{\KSD}{KS}
 \DeclareMathOperator{\SL}{SL}
 \DeclareMathOperator{\MEV}{MEV}
 \DeclareMathOperator{\SW}{SW}
 \DeclareMathOperator{\Chi1}{\chi^2_1}


% Boldface vectors and matrices

\def \avec {\text{\boldmath$a$}}    \def \mA {\text{\boldmath$A$}}
\def \bvec {\text{\boldmath$b$}}    \def \mB {\text{\boldmath$B$}}
\def \cvec {\text{\boldmath$c$}}    \def \mC {\text{\boldmath$C$}}
\def \dvec {\text{\boldmath$d$}}    \def \mD {\text{\boldmath$D$}}
\def \evec {\text{\boldmath$e$}}    \def \mE {\text{\boldmath$E$}}
\def \fvec {\text{\boldmath$f$}}    \def \mF {\text{\boldmath$F$}}
\def \gvec {\text{\boldmath$g$}}    \def \mG {\text{\boldmath$G$}}
\def \hvec {\text{\boldmath$h$}}    \def \mH {\text{\boldmath$H$}}
\def \ivec {\text{\boldmath$i$}}    \def \mI {\text{\boldmath$I$}}
\def \jvec {\text{\boldmath$j$}}    \def \mJ {\text{\boldmath$J$}}
\def \kvec {\text{\boldmath$k$}}    \def \mK {\text{\boldmath$K$}}
\def \lvec {\text{\boldmath$l$}}    \def \mL {\text{\boldmath$L$}}
\def \mvec {\text{\boldmath$m$}}    \def \mM {\text{\boldmath$M$}}
\def \nvec {\text{\boldmath$n$}}    \def \mN {\text{\boldmath$N$}}
\def \ovec {\text{\boldmath$o$}}    \def \mO {\text{\boldmath$O$}}
\def \pvec {\text{\boldmath$p$}}    \def \mP {\text{\boldmath$P$}}
\def \qvec {\text{\boldmath$q$}}    \def \mQ {\text{\boldmath$Q$}}
\def \rvec {\text{\boldmath$r$}}    \def \mR {\text{\boldmath$R$}}
\def \svec {\text{\boldmath$s$}}    \def \mS {\text{\boldmath$S$}}
\def \tvec {\text{\boldmath$t$}}    \def \mT {\text{\boldmath$T$}}
\def \uvec {\text{\boldmath$u$}}    \def \mU {\text{\boldmath$U$}}
\def \vvec {\text{\boldmath$v$}}    \def \mV {\text{\boldmath$V$}}
\def \wvec {\text{\boldmath$w$}}    \def \mW {\text{\boldmath$W$}}
\def \xvec {\text{\boldmath$x$}}    \def \mX {\text{\boldmath$X$}}
\def \yvec {\text{\boldmath$y$}}    \def \mY {\text{\boldmath$Y$}}
\def \zvec {\text{\boldmath$z$}}    \def \mZ {\text{\boldmath$Z$}}

 \def \calA {\mathcal A}
 \def \calB {\mathcal B}
 \def \calC {\mathcal C}
 \def \calD {\mathcal D}
 \def \calE {\mathcal E}
 \def \calF {\mathcal F}
 \def \calG {\mathcal G}
 \def \calH {\mathcal H}
 \def \calI {\mathcal I}
 \def \calJ {\mathcal J}
 \def \calK {\mathcal K}
 \def \calL {\mathcal L}
 \def \calM {\mathcal M}
 \def \calN {\mathcal N}
 \def \calO {\mathcal O}
 \def \calP {\mathcal P}
 \def \calQ {\mathcal Q}
 \def \calR {\mathcal R}
 \def \calS {\mathcal S}
 \def \calT {\mathcal T}
 \def \calU {\mathcal U}
 \def \calV {\mathcal V}
 \def \calW {\mathcal W}
 \def \calX {\mathcal X}
 \def \calY {\mathcal Y}
 \def \calZ {\mathcal Z}

\def \ahatvec {\text{\boldmath$\hat a$}}    \def \mhatA {\text{\boldmath$\hat A$}}
\def \bhatvec {\text{\boldmath$\hat b$}}    \def \mhatB {\text{\boldmath$\hat B$}}
\def \chatvec {\text{\boldmath$\hat c$}}    \def \mhatC {\text{\boldmath$\hat C$}}
\def \dhatvec {\text{\boldmath$\hat d$}}    \def \mhatD {\text{\boldmath$\hat D$}}
\def \ehatvec {\text{\boldmath$\hat e$}}    \def \mhatE {\text{\boldmath$\hat E$}}
\def \fhatvec {\text{\boldmath$\hat f$}}    \def \mhatF {\text{\boldmath$\hat F$}}
\def \ghatvec {\text{\boldmath$\hat g$}}    \def \mhatG {\text{\boldmath$\hat G$}}
\def \hhatvec {\text{\boldmath$\hat h$}}    \def \mhatH {\text{\boldmath$\hat H$}}
\def \ihatvec {\text{\boldmath$\hat i$}}    \def \mhatI {\text{\boldmath$\hat I$}}
\def \jhatvec {\text{\boldmath$\hat j$}}    \def \mhatJ {\text{\boldmath$\hat J$}}
\def \khatvec {\text{\boldmath$\hat k$}}    \def \mhatK {\text{\boldmath$\hat K$}}
\def \lhatvec {\text{\boldmath$\hat l$}}    \def \mhatL {\text{\boldmath$\hat L$}}
\def \mhatvec {\text{\boldmath$\hat m$}}    \def \mhatM {\text{\boldmath$\hat M$}}
\def \nhatvec {\text{\boldmath$\hat n$}}    \def \mhatN {\text{\boldmath$\hat N$}}
\def \ohatvec {\text{\boldmath$\hat o$}}    \def \mhatO {\text{\boldmath$\hat O$}}
\def \phatvec {\text{\boldmath$\hat p$}}    \def \mhatP {\text{\boldmath$\hat P$}}
\def \qhatvec {\text{\boldmath$\hat q$}}    \def \mhatQ {\text{\boldmath$\hat Q$}}
\def \rhatvec {\text{\boldmath$\hat r$}}    \def \mhatR {\text{\boldmath$\hat R$}}
\def \shatvec {\text{\boldmath$\hat s$}}    \def \mhatS {\text{\boldmath$\hat S$}}
\def \thatvec {\text{\boldmath$\hat t$}}    \def \mhatT {\text{\boldmath$\hat T$}}
\def \uhatvec {\text{\boldmath$\hat u$}}    \def \mhatU {\text{\boldmath$\hat U$}}
\def \vhatvec {\text{\boldmath$\hat v$}}    \def \mhatV {\text{\boldmath$\hat V$}}
\def \whatvec {\text{\boldmath$\hat w$}}    \def \mhatW {\text{\boldmath$\hat W$}}
\def \xhatvec {\text{\boldmath$\hat x$}}    \def \mhatX {\text{\boldmath$\hat X$}}
\def \yhatvec {\text{\boldmath$\hat y$}}    \def \mhatY {\text{\boldmath$\hat Y$}}
\def \zhatvec {\text{\boldmath$\hat z$}}    \def \mhatZ {\text{\boldmath$\hat Z$}}


\def \atildevec {\text{\boldmath$\tilde a$}}    \def \mtildeA {\text{\boldmath$\tilde A$}}
\def \btildevec {\text{\boldmath$\tilde b$}}    \def \mtildeB {\text{\boldmath$\tilde B$}}
\def \ctildevec {\text{\boldmath$\tilde c$}}    \def \mtildeC {\text{\boldmath$\tilde C$}}
\def \dtildevec {\text{\boldmath$\tilde d$}}    \def \mtildeD {\text{\boldmath$\tilde D$}}
\def \etildevec {\text{\boldmath$\tilde e$}}    \def \mtildeE {\text{\boldmath$\tilde E$}}
\def \ftildevec {\text{\boldmath$\tilde f$}}    \def \mtildeF {\text{\boldmath$\tilde F$}}
\def \gtildevec {\text{\boldmath$\tilde g$}}    \def \mtildeG {\text{\boldmath$\tilde G$}}
\def \htildevec {\text{\boldmath$\tilde h$}}    \def \mtildeH {\text{\boldmath$\tilde H$}}
\def \itildevec {\text{\boldmath$\tilde i$}}    \def \mtildeI {\text{\boldmath$\tilde I$}}
\def \jtildevec {\text{\boldmath$\tilde j$}}    \def \mtildeJ {\text{\boldmath$\tilde J$}}
\def \ktildevec {\text{\boldmath$\tilde k$}}    \def \mtildeK {\text{\boldmath$\tilde K$}}
\def \ltildevec {\text{\boldmath$\tilde l$}}    \def \mtildeL {\text{\boldmath$\tilde L$}}
\def \mtildevec {\text{\boldmath$\tilde m$}}    \def \mtildeM {\text{\boldmath$\tilde M$}}
\def \ntildevec {\text{\boldmath$\tilde n$}}    \def \mtildeN {\text{\boldmath$\tilde N$}}
\def \otildevec {\text{\boldmath$\tilde o$}}    \def \mtildeO {\text{\boldmath$\tilde O$}}
\def \ptildevec {\text{\boldmath$\tilde p$}}    \def \mtildeP {\text{\boldmath$\tilde P$}}
\def \qtildevec {\text{\boldmath$\tilde q$}}    \def \mtildeQ {\text{\boldmath$\tilde Q$}}
\def \rtildevec {\text{\boldmath$\tilde r$}}    \def \mtildeR {\text{\boldmath$\tilde R$}}
\def \stildevec {\text{\boldmath$\tilde s$}}    \def \mtildeS {\text{\boldmath$\tilde S$}}
\def \ttildevec {\text{\boldmath$\tilde t$}}    \def \mtildeT {\text{\boldmath$\tilde T$}}
\def \utildevec {\text{\boldmath$\tilde u$}}    \def \mtildeU {\text{\boldmath$\tilde U$}}
\def \vtildevec {\text{\boldmath$\tilde v$}}    \def \mtildeV {\text{\boldmath$\tilde V$}}
\def \wtildevec {\text{\boldmath$\tilde w$}}    \def \mtildeW {\text{\boldmath$\tilde W$}}
\def \xtildevec {\text{\boldmath$\tilde x$}}    \def \mtildeX {\text{\boldmath$\tilde X$}}
\def \ytildevec {\text{\boldmath$\tilde y$}}    \def \mtildeY {\text{\boldmath$\tilde Y$}}
\def \ztildevec {\text{\boldmath$\tilde z$}}    \def \mtildeZ {\text{\boldmath$\tilde Z$}}

\def \alphavec        {\text{\boldmath$\alpha$}}
\def \betavec         {\text{\boldmath$\beta$}}
\def \gammavec        {\text{\boldmath$\gamma$}}
\def \deltavec        {\text{\boldmath$\delta$}}
\def \epsilonvec      {\text{\boldmath$\epsilon$}}
\def \varepsilonvec   {\text{\boldmath$\varepsilon$}}
\def \zetavec         {\text{\boldmath$\zeta$}}
\def \etavec          {\text{\boldmath$\eta$}}
\def \thetavec        {\text{\boldmath$\theta$}}
\def \varthetavec     {\text{\boldmath$\vartheta$}}
\def \iotavec         {\text{\boldmath$\iota$}}
\def \kappavec        {\text{\boldmath$\kappa$}}
\def \lambdavec       {\text{\boldmath$\lambda$}}
\def \muvec           {\text{\boldmath$\mu$}}
\def \nuvec           {\text{\boldmath$\nu$}}
\def \xivec           {\text{\boldmath$\xi$}}
\def \pivec           {\text{\boldmath$\pi$}}
\def \varpivec        {\text{\boldmath$\varpi$}}
\def \rhovec          {\text{\boldmath$\rho$}}
\def \varrhovec       {\text{\boldmath$\varrho$}}
\def \sigmavec        {\text{\boldmath$\sigma$}}
\def \varsigmavec     {\text{\boldmath$\varsigma$}}
\def \tauvec          {\text{\boldmath$\tau$}}
\def \upsilonvec      {\text{\boldmath$\upsilon$}}
\def \phivec          {\text{\boldmath$\phi$}}
\def \varphivec       {\text{\boldmath$\varphi$}}
\def \psivec          {\text{\boldmath$\psi$}}
\def \chivec          {\text{\boldmath$\chi$}}
\def \omegavec        {\text{\boldmath$\omega$}}

\def \alphahatvec        {\text{\boldmath$\hat \alpha$}}
\def \betahatvec         {\text{\boldmath$\hat \beta$}}
\def \gammahatvec        {\text{\boldmath$\hat \gamma$}}
\def \deltahatvec        {\text{\boldmath$\hat \delta$}}
\def \epsilonhatvec      {\text{\boldmath$\hat \epsilon$}}
\def \varepsilonhatvec   {\text{\boldmath$\hat \varepsilon$}}
\def \zetahatvec         {\text{\boldmath$\hat \zeta$}}
\def \etahatvec          {\text{\boldmath$\hat \eta$}}
\def \thetahatvec        {\text{\boldmath$\hat \theta$}}
\def \varthetahatvec     {\text{\boldmath$\hat \vartheta$}}
\def \iotahatvec         {\text{\boldmath$\hat \iota$}}
\def \kappahatvec        {\text{\boldmath$\hat \kappa$}}
\def \lambdahatvec       {\text{\boldmath$\hat \lambda$}}
\def \muhatvec           {\text{\boldmath$\hat \mu$}}
\def \nuhatvec           {\text{\boldmath$\hat \nu$}}
\def \xihatvec           {\text{\boldmath$\hat \xi$}}
\def \pihatvec           {\text{\boldmath$\hat \pi$}}
\def \varpihatvec        {\text{\boldmath$\hat \varpi$}}
\def \rhohatvec          {\text{\boldmath$\hat \rho$}}
\def \varrhohatvec       {\text{\boldmath$\hat \varrho$}}
\def \sigmahatvec        {\text{\boldmath$\hat \sigma$}}
\def \varsigmahatvec     {\text{\boldmath$\hat \varsigma$}}
\def \tauhatvec          {\text{\boldmath$\hat \tau$}}
\def \upsilonhatvec      {\text{\boldmath$\hat \upsilon$}}
\def \phihatvec          {\text{\boldmath$\hat \phi$}}
\def \varphihatvec       {\text{\boldmath$\hat \varphi$}}
\def \psihatvec          {\text{\boldmath$\hat \psi$}}
\def \chihatvec          {\text{\boldmath$\hat \chi$}}
\def \omegahatvec        {\text{\boldmath$\hat \omega$}}

\def \alphatildevec        {\text{\boldmath$\tilde \alpha$}}
\def \betatildevec         {\text{\boldmath$\tilde \beta$}}
\def \gammatildevec        {\text{\boldmath$\tilde \gamma$}}
\def \deltatildevec        {\text{\boldmath$\tilde \delta$}}
\def \epsilontildevec      {\text{\boldmath$\tilde \epsilon$}}
\def \varepsilontildevec   {\text{\boldmath$\tilde \varepsilon$}}
\def \zetatildevec         {\text{\boldmath$\tilde \zeta$}}
\def \etatildevec          {\text{\boldmath$\tilde \eta$}}
\def \thetatildevec        {\text{\boldmath$\tilde \theta$}}
\def \varthetatildevec     {\text{\boldmath$\tilde \vartheta$}}
\def \iotatildevec         {\text{\boldmath$\tilde \iota$}}
\def \kappatildevec        {\text{\boldmath$\tilde \kappa$}}
\def \lambdatildevec       {\text{\boldmath$\tilde \lambda$}}
\def \mutildevec           {\text{\boldmath$\tilde \mu$}}
\def \nutildevec           {\text{\boldmath$\tilde \nu$}}
\def \xitildevec           {\text{\boldmath$\tilde \xi$}}
\def \pitildevec           {\text{\boldmath$\tilde \pi$}}
\def \varpitildevec        {\text{\boldmath$\tilde \varpi$}}
\def \rhotildevec          {\text{\boldmath$\tilde \rho$}}
\def \varrhotildevec       {\text{\boldmath$\tilde \varrho$}}
\def \sigmatildevec        {\text{\boldmath$\tilde \sigma$}}
\def \varsigmatildevec     {\text{\boldmath$\tilde \varsigma$}}
\def \tautildevec          {\text{\boldmath$\tilde \tau$}}
\def \upsilontildevec      {\text{\boldmath$\tilde \upsilon$}}
\def \phitildevec          {\text{\boldmath$\tilde \phi$}}
\def \varphitildevec       {\text{\boldmath$\tilde \varphi$}}
\def \psitildevec          {\text{\boldmath$\tilde \psi$}}
\def \chitildevec          {\text{\boldmath$\tilde \chi$}}
\def \omegatildevec        {\text{\boldmath$\tilde \omega$}}

\def \mGamma   {\mathbf{\Gamma}}
\def \mDelta   {\mathbf{\Delta}}
\def \mTheta   {\mathbf{\Theta}}
\def \mLambda  {\mathbf{\Lambda}}
\def \mXi      {\mathbf{\Xi}}
\def \mPi      {\mathbf{\Pi}}
\def \mSigma   {\mathbf{\Sigma}}
\def \mUpsilon {\mathbf{\Upsilon}}
\def \mPhi     {\mathbf{\Phi}}
\def \mPsi     {\mathbf{\Psi}}
\def \mOmega   {\mathbf{\Omega}}

\def \mhatGamma   {\mathbf{\hat \Gamma}}
\def \mhatDelta   {\mathbf{\hat \Delta}}
\def \mhatTheta   {\mathbf{\hat \Theta}}
\def \mhatLambda  {\mathbf{\hat \Lambda}}
\def \mhatXi      {\mathbf{\hat \Xi}}
\def \mhatPi      {\mathbf{\hat \Pi}}
\def \mhatSigma   {\mathbf{\hat \Sigma}}
\def \mhatUpsilon {\mathbf{\hat \Upsilon}}
\def \mhatPhi     {\mathbf{\hat \Phi}}
\def \mhatPsi     {\mathbf{\hat \Psi}}
\def \mhatOmega   {\mathbf{\hat \Omega}}

\def \nullvec {\mathbf{0}}
\def \onevec {\mathbf{1}}

%%% theorems
\newtheorem{lem}{Lemma}
\newtheorem{thm}{Theorem}
\newtheorem{coro}{Corollary}
\newtheorem{defn}{Definition}
\newtheorem{remark}{Remark}

\newcommand{\ubar}[1]{\underaccent{\bar}{#1}}

\theoremstyle{definition}
\newtheorem{exmpl}{Example}


\newcommand{\cmd}[1]{\texttt{#1()}}
\newcommand{\cls}[1]{\texttt{"#1"}}
\newcommand{\pll}{\tilde\ell(\parm,\shiftparm,\lambda,\alpha;\ry, \rs, \rx)}
\newcommand{\ull}{\ell(\parm,\shiftparm;\ry, \rs, \rx)}
\newcommand{\llpen}{\lambda\left(\alpha \norm{\shiftparm}_1 + \frac{1}{2}
	(1-\alpha) \norm{\shiftparm}_2^2 \right)}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\given}{\rvert}
\newcommand{\Prb}{\mathbb{P}}

<<setup, echo = FALSE, results = "hide", message = FALSE, warning=FALSE>>=
set.seed(241068)

knitr::opts_chunk$set(echo = TRUE, results = 'markup', error = FALSE,
                      warning = FALSE, message = FALSE,
                      tidy = FALSE, cache = TRUE, size = "small",
                      fig.width = 5, fig.height = 4, fig.align = "center",
                      out.width = NULL, ###'.6\\linewidth',
                      out.height = NULL,
                      fig.scap = NA)
knitr::render_sweave()  # use Sweave environments
knitr::set_header(highlight = '')  # do not \usepackage{Sweave}
## R settings
options(prompt = "R> ", continue = "+  ", useFancyQuotes = FALSE)  # JSS style
options(width = 75, digits = 3)

.cmp <- function(x, y) {
  ret <- cbind(x, y, x - y, (x - y) / x)
  colnames(ret) <- c("tram", "tramnet", "diff", "rel_diff")
  return(ret)
}

from <- function(todistr, n, p, n0p, dd, cfx, ord, support, add, seed) {
  yvar <- numeric_var("y", support = support, add = add)
  yB <- Bernstein_basis(yvar, order = ord, ui = "increasing", log_first = FALSE)
  st <- as.basis(as.formula(
    paste("~", paste0("X", seq_len(p), collapse = " + "), "- 1") ), data = dd)
  m <- ctm(response = yB, shifting = st, todistr = todistr)
  coef(m) <- cfx
  return(m)
}

print_mbo <- function(x, ...) {
  op = x$opt.path
  cat("Recommended parameters:\n")
  cat(paramValueToString(op$par.set, x$x), "\n")
  cat("Objective:", op$y.names[1], "=", round(x$y, 3), "\n")
}

# Dependencies
library("tramnet")
pkgs <- c("penalized", "glmnet", "mvtnorm", "Matrix", "coin", "kableExtra",
          "lattice", "colorspace")
av <- !any(!sapply(pkgs, \(x) require(x, character.only = TRUE)))

# Colors
if (av) {
  col <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90))
  fill <- diverge_hcl(2, h = c(246, 40), c = 96, l = c(65, 90), alpha = .3)
}

@

\begin{document}
%% do not edit, for illustration only
\sectionhead{Contributed research article}
\volume{XX}
\volnumber{YY}
\year{20ZZ}
\month{AAAA}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% replace RJtemplate with your article
\begin{article}

% !TeX root = RJwrapper.tex
\title{Regularized Transformation Models: The \pkg{tramnet} Package}
\author{by Lucas Kook and Torsten Hothorn}

\maketitle

\abstract{
  The \pkg{tramnet} package implements regularized linear
  transformation models by combining the flexible class of transformation
  models from \pkg{tram} with constrained convex optimization implemented in \pkg{CVXR}.
  Regularized transformation models unify many existing and novel regularized regression
  models under one theoretical and computational framework.
  Regularization strategies implemented for transformation models in \pkg{tramnet}
  include the LASSO, ridge regression and the elastic net and
  follow the parametrization in \pkg{glmnet}. Several functionalities
  for optimizing the hyperparameters, including model-based optimization based on
  the \pkg{mlrMBO} package, are implemented. A multitude of \code{S3} methods
  are deployed for visualization, handling and simulation purposes. This work
  aims at illustrating all facets of \pkg{tramnet} in realistic settings and comparing
  regularized transformation models with existing implementations of similar models.
}%%%

%BOF%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}
%-------------------------------------------------------------------------------

A plethora of \textsf{R} packages exist to estimate generalized linear regression
models via penalized maximum likelihood, such as \pkg{penalized}
\citep{penalizedpaper} and \pkg{glmnet} \citep{glmnetpaper}. Both packages come
with an extension to fit a penalized form of the Cox proportional hazard model.
The \pkg{tramnet} package aims at unifying the above mentioned and several
novel models using the theoretical and computational framework of
transformation models. Novel models in this class include Continuous Outcome
Logistic Regression (COLR) as introduced by \citet{COLRpaper} and Box-Cox type
regression models with a transformed conditionally normal response \citep{boxcox,
pkg:tram}.

The disciplined convex optimization package \pkg{CVXR} \citep{CVXRpaper} is
applied to solve the constrained convex optimization problems that arise
when fitting regularized transformation models.  Transformation models are
introduced in Section~\ref{subsec:tram}, for a more theoretical treatise we
refer to \citet{ctmpaper, mltpaper, mltjss}.  Convex optimization and domain
specific languages are briefly discussed in Section~\ref{subsec:CVXR},
followed by a treatment of model-based optimization for hyperparameter
tuning (\ref{subsec:mbo}).

\subsection{Transformation models} \label{subsec:tram}

In stark contrast to penalized generalized linear models, regularized
transformation models aim at estimating the response's whole conditional distribution
instead of focusing on a single moment, \eg the conditional mean.
This conditional distribution function of a response $\rY$ is decomposed into an
\textit{a priori} chosen absolute continuous and log-concave error distribution
$F$ and a conditional transformation function $\h(\ry \given \rx, \rs)$ that depends
on the measured covariates $\rx$ and stratum variables $\rs$ and is monotone
increasing in $\ry$.
% \begin{eqnarray} \label{eq:ctm}
% \begin{aligned}
%   \Prb\left(\rY \le \ry \given \rX = \rx\right) = \pY\left(\rY \given \rX = \rx\right)
% \end{aligned}
% \end{eqnarray}
Although the model class is more flexible,
packages \pkg{tram} and \pkg{tramnet} focus on stratified linear transformation models of the form
% \begin{eqnarray*}
% 	F\left(\h(\ry \given \rx)\right) = F\left(\h(\ry) -
% 	\tilde\rx^\top\shiftparm\right),
% \end{eqnarray*}
% where the covariates introduce linear shift effects $\shiftparm$ from the baseline
% transformation $\h(\ry)$ which now only depends on the response $\ry$. An
% additional layer of complexity, akin to equation \eqref{eq:ctm}, can be introduced
% using stratum variables $\rs$, resulting in transformation models of the form
\begin{eqnarray} \label{fm:trafo}
	\Prb\left(\rY \le \ry \given \rX = \rx, \rS = \rs \right) =
	F\left(\h(\ry \given \rs, \rx)\right) = F\left(\h(\ry \given \rs) -
	\rx^\top\shiftparm\right).
\end{eqnarray}
Here, the baseline transformation is allowed to vary with stratum variables $\rs$,
while covariate effects $\shiftparm$ are restricted to be shifts common to all
baseline transformations $\h(\ry \given \rs)$.

% \subsection{Parametrization of the baseline transformation}

In order for the model to represent a valid cumulative distribution function,
$F\left(\h(\ry \given \rs, \rx)\right)$ has to be monotone increasing in $\ry$ and thus in
$\h$ for all possible strata $\rs$ and all possible configurations of the
covariates $\rx$. To ensure monotonicity, $\h$ is parametrized in terms of a basis
expansion using Bernstein polynomials as implemented in the \pkg{basefun}
package \citep{mltjss}. Hence, $\h$ is of the form
\begin{eqnarray*}
	\h(\ry) = \bern{p}(\ry)^\top \parm,
\end{eqnarray*}
where $\bern{p}(\ry)$ denotes the vector of basis functions in $\ry$ of order $p$
and $\parm$ are the coefficients for each basis function. Conveniently,
$\bern{p}(\ry)^\top \parm$ is monotone increasing in $\ry$ as long as
\begin{eqnarray} \label{eq:const}
\begin{array}{ll}
\eparm_i \leq \eparm_{i+1} & \forall \; i = 0, \dots, p - 1
\end{array}
\end{eqnarray}
holds. For the concrete parameterization of stratified linear transformation
models the reader is referred to \citet{pkg:tram}.

% \subsection{Likelihood contributions} \label{subsec:likelihoods}

Many contemporary models can be understood as linear transformation models,
such as the normal linear regression model, logistic regression for binary,
ordered and continuous responses, as well as exponential, Weibull and Rayleigh
regression and the Cox model in survival analysis. Thus, by appropriately choosing
and parametrizing $F$ and $\h$ one can understand all those models in the same
maximum likelihood-based framework. One can formulate the corresponding likelihood
contributions not only for exact observations, but under any form of random
censoring and truncation for continuous and discrete or ordered categorical
responses.

Given a univariate response $\rY$ and a set of covariates $\rX$ one can specify
the following cumulative distribution function and density valid for any linear
transformation model,
\begin{eqnarray*}
  \begin{aligned}
  	\pYx(\ry \given \rs, \rx) &= F\left(\h(\ry \mid \rs) - \rx^\top \shiftparm \right), \\
  	\dYx(\ry \given \rs, \rx) &= F'\left(\h(\ry \mid \rs) - \rx^\top \shiftparm \right)
  	\cdot \h'(\ry \mid \rs).
  \end{aligned}
\end{eqnarray*}
From here, the log-likelihood contributions for exact, right, left, and interval
censored responses can be derived as
\begin{eqnarray*}
  \begin{aligned}
  	\ell_i(\parm,\shiftparm;\ry_i, \rs_i, \rx_i) &=
  	\left\{
  	\begin{array}{lll}
  	  \log\left(F'\left(\h\left(\ry_i \mid \rs_i \right) - \rx_i^\top \shiftparm
  	  \right)\right) + \log\left(\h'(\ry_i \mid \rs_i)\right) & \ry_i & \text{exact}\\
  	  \log\left(F\left(\h(\bar{\ry} \mid \rs_i) - \rx_i^\top \shiftparm\right)
  	  \right) & \ry_i  \in (-\infty, \bar{y}] & \text{left} \\
  	   \log\left(1 -
  	  F\left(\h(\ubar{\ry} \mid \rs_i) - \rx_i^\top \shiftparm\right)\right) &
  	  \ry_i \in (\ubar{\ry}, \infty) & \text{right} \\
  	  \log\left(F\left(\h(\bar{\ry} \mid \rs_i) - \rx_i^\top \shiftparm\right) -
  	  F\left(\h(\ubar{\ry} \mid \rs_i) - \rx_i^\top \shiftparm\right)\right) &
  	  \ry_i \in (\ubar{\ry}, \bar{\ry}] & \text{interval}.  \\
  	\end{array}
  	\right.
  \end{aligned}
\end{eqnarray*}
The joint log-likelihood of several observations $\ry_1, \dots, \ry_n$ is obtained
by summing over the individual log-likelihood contributions $\ell_i$ under the
assumption that the individual samples are independent and identically
distributed, the case exclusively dealt with by \pkg{tramnet}.

\subsection{Regularization} \label{regul}

The aim of \pkg{tramnet} is to enable the estimation of regularized stratified
linear transformation models. This is achieved by optimizing a penalized
form of the log-likelihood introduced in the last section. The penalized
log-likelihood,
\begin{eqnarray*}
  \pll = \ull - \llpen,
\end{eqnarray*}
consists of the unpenalized log-likelihood and an additional penalty term. Note
that only the shift parameters $\shiftparm$ are penalized, whereas the coefficients
for the baseline transformation $\parm$ remain unpenalized.
The parameterization of the penalty is chosen to be the same as in \pkg{glmnet},
consisting of a global penalization parameter $\lambda$ and a mixing parameter
$\alpha$ controlling the amount of $L_1$ compared to $L_2$ penalization.

The two penalties and any combination thereof have unique properties and may be useful
under different circumstances. A pure $L_1$ penalty was first introduced by \citet{lasso1996}
in an OLS framework and was dubbed the LASSO (Least Absolute
Shrinkage and Selection Operator) due to its property of shrinking
regression coefficients exactly to 0 for large enough $\lambda$. A pure LASSO
penalty can be obtained in a regularized transformation model by specifying
$\alpha = 1$. Applying an $L_2$ penalty in an OLS problem was introduced more than
five decades earlier by \citet{tikhonov1943stability} and later termed ridge
regression \citep{hoerl1970ridge}.
In contrast to LASSO, ridge regression leads to shrunken regression coefficients,
but does not perform automatic variable selection. \citet{zou2005regularization}
picked up on both approaches, discussed their advantages, disadvantages and
overall characteristics and combined them into the elastic net penalty,
a convex combination of an $L_1$ and $L_2$ penatly controlled by the mixing
parameter $\alpha$.
Some of these properties will be illustrated for different models and a real world
data set in sections \ref{subsec:modelclasses} and \ref{sec:tuning}.

\subsection{Constrained convex optimization} \label{subsec:CVXR}

Special algorithms were developed to optimize regularized objective functions,
most prominently the LARS and LARS-EN algorithm \citep{efron2004} and variants
thereof for the penalized Cox model \citep{penalizedpaper}. However, the aim of
\pkg{tramnet} is to solve the objective functions arising in regularized transformation
models in a single computational framework.
Due to the log-concavity of all choices for $F$ in this package and $\h(\ry)$ being
monotone increasing in $\ry$, the resulting log-likelihood contributions for any
form of censoring and truncation are concave and can thus be solved by constrained
convex optimization.

The fairly recent development of \pkg{CVXR} allows the specification of constrained
convex optimization problems in terms of domain-specific language, yielding
an intuitive and highly flexible framework for constrained optimization. Because
checking convexity of an aribitrarily complex expression is extremely hard,
\pkg{CVXR} makes use of a library of smaller expressions, called atoms, with known
monotonicity and curvature and tries to decompose the objective at hand using a
set of rules from disciplined convex optimization (DCP) \citep{DCPpaper}. Thus a
complex expression's curvature can be more easily determined.

More formally, convex optimization aims at solving a problem of the form
\begin{eqnarray*}
  \begin{array}{ll}
  	\underset{\parm}{\mbox{minimize}} & g(\parm) \\
  	\mbox{subject to} & g_i(\parm) \leq 0, \; i = 1, \dots, K \\
  	& \mA\parm = \bvec,
  \end{array}
\end{eqnarray*}
where $\parm \in \mathbb{R}^p$ is the parameter vector, $g(\parm)$ is the objective
function to be optimized, $g_i(\parm)$ specify the inequality constraints and
$\mA \in \mathbb{R}^{n \times p}$ and $\bvec \in \mathbb{R}^p$ parametrize any
equality constraints on $\parm$. Importantly, the objective function and all
inequality constraint functions are convex \citep{boyd2004convex}.

The likelihood $\sum_i \ell_i(\parm, \shiftparm; \ry_i, \rs_i, \rx_i)$ for
transformation models of the form (\ref{fm:trafo}) are convex for error
distributions with log-concave density, because log-convexity
of $-{F'}$ ensures the existence and uniqueness of the most likely transformation
$\hat\h(\ry)$ and the convexity of $-\ell(\h; \ry, \rx)$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{exmpl}[Curvature of the log-density of the Gompertz distribution]
%
% The standard minimum extreme value distribution as defined by its cdf and pdf
% \begin{eqnarray*}
%     \begin{aligned}
%     	F(\rz) &= 1 - \exp\left(-\exp(\rz)\right) \\
%     	{F'}(\rz) &= \exp\left(-\exp(z) + z\right)
%     \end{aligned}
% \end{eqnarray*}
% is log-concave in $\rz$ because the condition $\left(\log{F'}\right)'' < 0$,
% explicitly given by
% \begin{eqnarray*}
%   \begin{aligned}
%      \log{F'} &= -\exp(z) + z \\
%      \left(\log{F'}(\rz)\right)'' &= -\exp(z)
%   \end{aligned}
% \end{eqnarray*}
% holds $\forall\rz\in\mathbb{R}$. Log-convexity of $-{F'}$ then follows.
% \end{exmpl}
%-------------------------------------------------------------------------------
Because the penalty term $$\llpen$$ is convex in $\shiftparm$, it can
be added to the negative log-likelihood while conserving convexity.
However, monotonicity of $\h$ imposes inequality constraints on the parameters
of the baseline transformation as illustrated in equation~\eqref{eq:const}.
The elegance of domain-specific language based optimizers comes to play when adding
these and potential other inequality or equality constraints to the objective
function, which will be showcased in Section~\ref{subsec:otherconstr}.
Thus, the optimisation routines implemented in package \pkg{CVXR} can be
applied for computing maximum likelihood estimates of the parameters of
model (\ref{fm:trafo}).

\subsection{Model-based optimization} \label{subsec:mbo}

The predictive capabilities of regularized regression models heavily depend on
the hyperparameters $\alpha$ and $\lambda$. Hyperparameter tuning
can be addressed by a multitude of methods with varying computational complexity,
advantages and disadvantages. Naive or random grid search for more than one
tuning parameter are computationally demanding, especially if the objective function
is expensive to evaluate. Model-based optimization circumvents this issue by
fitting a surrogate model, usually a Gaussian process, to the objective function.
The objective function is evaluated at an initial, \eg a random latin hypercube,
design, to which the Gaussian process is subsequently fit. The surrogate model
then proposes the next set of hyperparameters to evaluate the objective function
at by some  infill criterion \citep{mbopaper}.
\citet{mlrMBO} implement model-based optimization
for multi-objective blackbox functions in the \pkg{mlrMBO} package. The objective
function can in theory be vector-valued and the tuning parameter spaces may be
categorical. In \pkg{tramnet} the objective function is the cross-validated
log-likelihood optimized using a Kriging surrogate model with expected
improvement as the infill criterion. Model-based optimization for hyperparameter
tuning is illustrated in section \ref{sec:prostateanalysis}.

\subsection{Basic usage} \label{subsec:usage}

The initial step is fitting a potentially stratified, transformation model of
the form
<<eval=FALSE>>=
m1 <- tram(y | s ~ 1, ...)
@
omitting all explanatory variables. This sets up the basis expansion for the
transformation function, whose regression coefficients will not be penalized, as
mentioned in section \ref{regul}.
Additionally, \cmd{tramnet} needs a model matrix including the predictors, whose
regression coefficients ought to be penalized. For numerical reasons it is useful
to provide a scaled model matrix, instead of the original data, such that every
parameter is equally affected by the regularization.
Lastly, \cmd{tramnet} will need the tuning parameters $\alpha \in [0, 1]$ and
$\lambda \in \mathbb{R}^+$, with $\alpha$ representing a mixing parameter and
$\lambda$ controlling the extent of regularization. Setting $\lambda = 0$ will
result in an unpenalized model, regardless of the value of $\alpha$.
<<eval=FALSE>>=
x <- model.matrix(~ 0 + x, ...)
x_scaled <- scale(x)
mt <- tramnet(model = m1, x = x_scaled, lambda, alpha, ...)
@
S3 methods accompanying the \cls{tramnet} class will be discussed in section
\ref{sec:methods}.

\subsection{Censoring and likelihood forms} \label{subsec:modelclasses}

Specific combinations of $F$ and the form of censoring yield log-log-concave
log-likelihoods. Under these circumstances \pkg{tramnet} is not yet able to solve
the resulting optimization problem.
% In future versions \pkg{CVXR}, disciplined
% geometric programming will be added, which will solve this problem.
Table \ref{tab:models} indicates which model class can be fitted under what type
of censoring in the current version of \pkg{tramnet}.

\begin{table}
  \centering
  \caption{Combinations of model classes and censoring types that are possible
  	in the \pkg{tramnet} package. Due to missing disciplined geometric optimization
  	rules in \pkg{CVXR} not every combination of error distribution and censoring
  	type yield solvable objective functions. This will change with coming updates
  	in the \pkg{CVXR} package.}
  \label{tab:models}
  \begin{tabular}{@{}lcccc@{}}
  	\toprule
  	\multicolumn{1}{c}{\textbf{Model Class}} &
  	\multicolumn{4}{c}{\textbf{Censoring Type}} \\ \midrule
  	\textbf{} & \multicolumn{1}{l}{\textbf{Exact}} & \multicolumn{1}{l}{\textbf{Left}} &
  	\multicolumn{1}{l}{\textbf{Right}} & \multicolumn{1}{l}{\textbf{Interval}} \\
  	\cmidrule(l){2-5}
  	BoxCox        & \cmark & \xmark & \xmark & \xmark  \\
  	Colr          & \cmark & \cmark & \cmark & \xmark  \\
  	Coxph         & \cmark & \xmark & \cmark & \xmark  \\
  	Lehmann       & \cmark & \cmark & \xmark & \xmark  \\
  	\bottomrule
  \end{tabular}
\end{table}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Prostate cancer data analysis} \label{sec:prostateanalysis}
%-------------------------------------------------------------------------------

The regularized normal linear and extensions to transformed normal regression models
will be illustrated using the \code{Prostate} data set \citep{stamey1989prostate},
which was used by \citet{zou2005regularization} to highlight properties of the
elastic net.
<<BostonHousing2_data>>=
load("Prostate.rda")
Prostate$psa <- exp(Prostate$lpsa)
Prostate[, 1:8] <- scale(Prostate[, 1:8])
@
The data set contains \Sexpr{nrow(Prostate)} observations and \Sexpr{ncol(Prostate) - 1}
covariates. In the original paper the authors chose the log-transformed prostate
specific antigen concentration (\code{lpsa}) as the response and used the eight
remaining predictors log cancer volume (\code{lcavol}), log prostate weight
(\code{lweight}), age of the patient (\code{age}), log benign prostatic hyperplasia amount
(\code{lbph}), seminal vesicle invasion (\code{svi} coded as 1 for yes, 0 for no),
log capsular penetration (\code{lcp}), Gleason score (\code{gleason}) and percentage
Gleason score 4 or 5 (\code{pgg45}) as covariates.

\subsection{Linear and Box-Cox type regression models}

\citet{zou2005regularization} imposed an assumption on the conditional distribution of the
response by log-transforming and fitting a linear model. In the following it is
shown that the impact of this assumption may be assessed by estimating the baseline
transformation from the data, followed by a comparison with the
log-transformation applied by \citet{zou2005regularization}. The linear models in \code{lpsa} and $\log($\code{psa}$)$
are compared to transformation models with basis expansions in both $\log($\code{psa}$)$
and \code{psa}, while specifying conditional normality of the transformed response.
Additionally, the models are compared to an alternative implementation of regularized
normal linear regression in \pkg{penalized}. Five different models will be used
to illustrate important facettes of transformation models, including parametrization and
interpretation. The models are summarized in Table \ref{tab:mods} and will be
elaborated on throughout this section. The comparison is based on unpenalized
models first. Later, the section highlights the penalized models together with
hyperparameter tuning.

\begin{table}[]
\centering
\caption{Summary of the five models illustrated in section \ref{sec:prostateanalysis},
including their name throughout the manuscript, the \textsf{R}~code to fit them
and the mathematical formulation of their conditional cumulative distribution function.
For comparison \code{mp} is included as an ordinary linear model, which is equivalent
to model \code{mt} in terms of log-likelihood, but differs in the parametrization of
the transformation function $\h$ and thus yields scaled coefficient estimates
(\emph{cf.} Table \ref{tab:prostatemods}).
Model \code{mtp} is a linear model parametrized in terms of a Bernstein basis of
maximum order 1. This will yield the same coefficient estimates as \code{mt} but
a log-likelihood that is comparable to models \code{mt1} and \code{mt2}, whose
transformation functions are parametrized in terms of higher order Bernstein bases.
The \code{log\_first} argument specifies whether the basis expansion is calculated
on the log-transformed or untransformed response.
}\label{tab:mods}
\renewcommand{\arraystretch}{1.3}
\resizebox{\textwidth}{!}{
\begin{tabular}{lll}
\toprule
\textbf{Name} & \textbf{Code} & \textbf{Model for} $\pYx(\ry \given \rx)$ \\
\midrule
\code{mp}     & \code{penalized(response = lpsa, penalized = x)}            &
  $\pN\left(\eparm_1 + \eparm_2\log(\ry) - \rx^\top\shiftparm \right)$  \\
\code{mt}    & \code{Lm(lpsa $\sim$ .)}                                    &
  $\pN\left(\eparm_1 + \eparm_2\log(\ry) - \rx^\top\shiftparm \right)$  \\
\code{mtp}     & \code{BoxCox(psa $\sim$ ., log\_first = TRUE, order = 1)}   &
  $\pN\left(\bern{1}(\log(\ry))^\top\parm - \rx^\top\shiftparm \right)$ \\
\code{mt1}    & \code{BoxCox(psa $\sim$ ., log\_first = TRUE, order = 7)}   &
  $\pN\left(\bern{7}(\log(\ry))^\top\parm - \rx^\top\shiftparm \right)$ \\
\code{mt2}    & \code{BoxCox(psa $\sim$ ., log\_first = FALSE, order = 11)} &
  $\pN\left(\bern{11}(\ry)^\top\parm - \rx^\top\shiftparm \right)$ \\
\bottomrule
\end{tabular}
}
\end{table}

<<BH_Linear1, eval=av>>=
fm_Pr <- psa ~ lcavol + lweight + age + lbph + svi + lcp + gleason + pgg45
fm_Pr1 <- update(fm_Pr, ~ 0 + .)
x <- model.matrix(fm_Pr1, data = Prostate)
@
The normal linear regression model is implemented in \pkg{tram}'s \cmd{Lm} function.
\cmd{Lm}'s parametrization differs from the usual linear model, hence caution has
to be taken when interpreting the resulting regression coefficients $\shiftparm$.
In order to compare the results to an equivalent, already existing implementation,
the same model is fitted using \pkg{penalized}.
<<BH_Linear2, results='hide', eval=av>>=
m0 <- Lm(lpsa ~ 1, data = Prostate)
mt <- tramnet(m0, x = x, alpha = 0, lambda = 0)
mp <- penalized(response = Prostate$lpsa, penalized = x,
                lambda1 = 0, lambda2 = 0)
@
A linear model of the form
\begin{eqnarray*}
\rY = \tilde{\alpha} + \rx^\top \tilde{\shiftparm} + \varepsilon,
    \quad \varepsilon \sim \ND(0, \sigma^2)
\end{eqnarray*}
can be understood as a transformation model through reparametrization as
\begin{eqnarray*} \label{Lm}
  \Prb(\rY \le  \ry \given \rX = \rx) = \Phi\left(\eparm_1 + \eparm_2 \ry -
    \rx^\top \shiftparm\right).
\end{eqnarray*}
Here $\eparm_1 = -\tilde{\alpha} / \sigma$ is a reparametrized intercept term,
$\eparm_2 = 1 / \sigma$ is the slope of the baseline transformation and the
regression coefficients $\shiftparm = \tilde{\shiftparm} / \sigma$ represent
scaled shift terms, influencing only the intercept. To recover the usual
parametrization \cmd{tramnet::coef.Lm} offers the \code{as.lm = TRUE} argument.
<<BH_Linear3, eval=av>>=
cfx_tramnet <- coef(mt, as.lm = TRUE)
@
The transformation function for the linear model is depicted in Figure
\ref{fig:Prostate_trafos} (pink line).
Because a linear baseline transformation imposes restrictive assumptions on the
response's conditional distribution, it is advantageous to replace the linear baseline
transformation by a more flexible one. In the case of the Box-Cox type regression
model the linear baseline transformation $\h(\ry) = \eparm_1 + \eparm_2 \log\ry$
is replaced by the basis expansion $\h(\ry) = \bern{7}(\log\ry)^\top \parm$.
<<BH_BoxCox1, eval=av>>=
ord <- 7 # flexible baseline transformation
m01 <- BoxCox(psa ~ 1, data = Prostate, order = ord,
              extrapolate = TRUE, log_first = TRUE)
mt1 <- tramnet(m01, x = x, alpha = 0, lambda = 0)
@
The Box-Cox type regression model is then estimated with the \cmd{BoxCox} function,
while specifying the appropriate maximum order of the Bernstein polynomial.
Because, the more flexible transformation slightly deviates from being linear, the normal
linear model yields a smaller log-likelihood (\emph{cf.} Table \ref{tab:prostatemods}).
To make sure that this improvement is not due to the increased number of parameters
and hence overfitting, the models predictive capacities could be compared via
cross-validation.

These results hold for the pre-specified log transformation of the response and
a basis expansion thereof. Instead of prespecifying the log-transformation, its
`logarithmic nature' can be estimated from the data. Afterwards one can compare the
deviation from a log-linear baseline transformation graphically and by inspecting
the predictive performance of the model in terms of the out-of-sample log-likelihood.
<<Prostate_baseline_trafo, eval=av>>=
m02 <- BoxCox(psa ~ 1, order = 11, data = Prostate, extrapolate = TRUE)
mt2 <- tramnet(m02, x = x, lambda = 0, alpha = 0)
@
Indeed the baseline transformation in Figure \ref{fig:Prostate_trafos} is
similar to the basis expansion in the log-transformed response upon visual inspection.
Because \code{mt} is estimated using the log-transformed response and \code{mt1}
and \code{mt2} are based on the original scale of the response, the resulting
model log-likelihoods are not comparable. To overcome this issue, one can fit a
Box-Cox type model with maximum order 1, as this results in a linear, but
alternatively parametrized baseline transformation.
<<linear_boxcox, eval=av>>=
m0p <- BoxCox(psa ~ 1, order = 1, data = Prostate, log_first = TRUE)
mtp <- tramnet(m0p, x = x, lambda = 0, alpha = 0)
@
Figure \ref{fig:Prostate_trafos} plots the three distinct baseline transformations
resulting from models \code{mt}, \code{mt1} and \code{mt2}. The initial assumption
to model the prostate specific antigen concentration linearily on the log-scale
seems to be valid when comparing the three transformation functions.
The linear transformation in \code{lpsa} used in \code{mt} and the basis expansion
in $\log($\code{psa}$)$ (\code{mt1}) are almost indistinguishable and yield very
similar coefficient estimates, as well as log-likelihoods (\emph{cf.} Table
\ref{tab:prostatemods}, \code{mtp} \emph{vs.} \code{mt1}). The basis expansion
in \code{psa} (\code{mt2}) is expected to be less stable due to the highly skewed
untransformed response. This is reflected in Figure \ref{fig:Prostate_trafos},
where the baseline transformation deviates from being linear towards the bounds
of the response's support. However, the log-linear behaviour of $\h$ was clearly
captured by this model and further supports the initial assumption of conditional
log-normality of the response. For the same reasons, the resulting log-likelihood
of \code{mt2} is smaller than for \code{mt1} (Table \ref{tab:prostatemods}). Taken together,
this exemplary analysis highlights the flexibility and usefulness of transformation
models to judge crucial modelling assumptions.

%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---
\begin{figure}[!ht]
\centering
<<BH_Linear5, echo=FALSE, fig.height=3, fig.width=5, eval=av>>=
K <- 1e3
nd <- Prostate[rep(1, K),]
nd$lpsa <- seq(min(Prostate$lpsa), max(Prostate$lpsa), length.out = K)
nd$psa <- exp(nd$lpsa)
nd$pred1 <- predict(mt, type = "trafo", newdata = nd)
nd$pred2 <- predict(mt1, type = "trafo", newdata = nd)
nd$pred3 <- predict(mt2, type = "trafo", newdata = nd)

col3 <- qualitative_hcl(3)
xyplot(pred1 + pred2 + pred3 ~ lpsa, data = nd, type = "l", lwd = 1.5,
       col = col3, ylab = expression(italic(h)(italic(y))),
       xlab = expression(log(psa)), # asp = 1,
       # main = "Estimating the baseline transformation from data",
       panel = function(x, y, ...) {
         panel.grid(h = -1, v = -1)
         panel.xyplot(x, y, ...)
         panel.rug(x = Prostate$lpsa, end = 0.02, col = "grey60")
       }, auto.key = list(
         text = c("mt", "mt1", "mt2"),
         points = FALSE, corner = c(1, 0), x = 0.95, y = 0.05, col = col3, cex = 0.75
         ), scales = list(tck = c(1, 0)),
       par.settings = list(layout.heights = list(bottom.padding = 0,
                                                 top.padding = 0))
       )
@
\caption{Comparison of different choices for the baseline transformation of the
response (prostate specific antigen concentration) in the \code{Prostate} data.
The original analysis prespecified a log-transformation of the response and then
assumed conditional normality on this scale. Hence the baseline transformation
of \code{mt} is of the form: $\h(\mathrm{lpsa}) = \eparm_1 + \eparm_2 \cdot
\mathrm{lpsa}$. Now one can allow a more flexible transformation function in
$\log($\code{psa}$)$ to judge any deviations of $\h(\log(\mathrm{psa}))$ from
linearity, leading to a baseline transformation in terms of basis functions:
$\bern{7}(\log(\mathrm{psa}))^\top \parm$ in \code{mt1}. Lastly, instead of
presuming a log-transformation one could estimate the baseline transformation
from the raw response (\code{psa}), i.e. $\h(\mathrm{psa}) = \bern{11}
(\mathrm{psa})^\top \parm$ in \code{mt2}. In this case, a higher order basis
expansion was chosen to account for the skewness of the raw response. Notably,
all three baseline transformations are fairly comparable. The basis expansion
in \code{psa} deviates from being log-linear towards the boundaries of the
response's support, as there are only few observations.}
\label{fig:Prostate_trafos}
\end{figure}
%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---

%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---
\begin{table}
\centering
\caption{Comparison of the three transformation models on the \code{Prostate} data.
Coefficient estimates are shown for each model, together with the in-sample
log-likelihood in the last column. The first three models, \code{mp}, \code{mt} and
\code{mtp} use a linear baseline transformation in \code{lpsa} and $\log($\code{psa}$)$,
respectively. The \code{mp} model was fit using \pkg{penalized} and gives the scaled
version of the regression coefficients in \code{mt}, but the same log-likelihood.
At the same time, \code{mt} and \code{mtp} differ only in their response variable
and its subsequent log-transformation in \code{mtp}, yielding the same coefficient
estimates but a different log-likelihood. Models \code{mt1} and \code{mt2} allow a
flexible basis expansion in $\log($\code{psa}$)$ and \code{psa}, respectively.
Model \code{mt1}, allowing for a flexible basis expansion in \code{lpsa}, fits the
data the best, however the resulting coefficient estimates are similar for all
models.} \label{tab:prostatemods}
<<table1, results='asis', echo=FALSE, eval=av>>=
cfx_tab <- as.data.frame(rbind(coef(mp, which = "penalized"), coef(mt), coef(mtp),
                               coef(mt1), coef(mt2)))
cfx_tab$` ` <- c(loglik(mp), logLik(mt), logLik(mtp), logLik(mt1), logLik(mt2))
rownames(cfx_tab) <- paste0("\\code{", c("mp", "mt", "mtp", "mt1", "mt2"), "}")
kbl <- knitr::kable(as.data.frame(cfx_tab), row.names = TRUE, booktabs = TRUE,
                    linesep = "", format = "latex", digits = 2, escape = FALSE)
kableExtra::add_header_above(kbl, header = c("Model" = 1, "Coefficient estimates" = 8,
                                             "logLik" = 1), escape = FALSE,
                             bold = TRUE)
@
\end{table}
%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---%---


\subsection{Hyperparameter tuning} \label{sec:tuning}

This section features cross-validation, model-based optimization and profiling functions
for hyperparameter tuning, whose appropriate values are highly problem-dependent
and hard to know in advance. \pkg{tramnet} implements naive grid search and model-based
optimization in the functions \cmd{cvl\_tramnet} and \cmd{tramnet\_mbo}, respectively.
In the framework of regularized transformation models it is very natural to choose
the out-of-sample log-likelihood as the objective function, because
the notion of a mean square loss does not make sense for survival, let alone
censored outcomes. The out-of-sample log-likelihood is in fact the log score,
which is a proper scoring rule \citep{Gneiting_Raftery_2007}.
<<cvl_prof_setup, eval=FALSE>>=
m0 <- BoxCox(lpsa ~ 1, data = Prostate, order = 7, extrapolate = TRUE)
mt <- tramnet(m01, x = x, alpha = 1, lambda = 0)
@
\pkg{tramnet} offers cross-validation in \cmd{cvl\_tramnet}, comparable to the
\cmd{optL1} and \cmd{optL2} functions in \pkg{penalized}, which takes a sequence
of values for $\lambda$ and $\alpha$ and performs a simple --~and arguably slow~--
grid search. Per default it computes 2-fold cross-validation, the user is encouraged,
however, to judge the resulting bias-variance trade-off accordingly.
<<cvl_tramnet, eval=FALSE>>=
lambdas <- c(0, 10^seq(-4, log10(15), length.out = 4))
cvlt <- cvl_tramnet(object = mt, fold = 2, lambda = lambdas, alpha = 1)
@
In order to compare cross-validation across multiple packages and functions it
is also possible to supply the folds for each row in the design matrix as a
numeric vector, as for example returned by \cmd{penalized::optL1}.
<<cvl_tramnet2, eval=FALSE>>=
pen_cvl <- optL1(response = lpsa, penalized = x, lambda2 = 0, data = Prostate,
                 fold = 2)
cvlt <- cvl_tramnet(object = mt, lambda = lambdas, alpha = 1,
                    folds = pen_cvl$fold)
@
The resulting object is of class \cls{cvl\_tramnet} and contains a table for the
cross-validated log-likelihoods for each fold and the sum thereof, the `optimal'
tuning parameter constellation which resulted in the largest cross-validated
log-likelihood, tables for the cross-validated regularization paths, the
folds and lastly the full fit based on the `optimal' tuning parameters. Additionally,
the resulting object can be used to visualize the log-likelihood and coefficient
trajectories. These trajectories highly depend on the chosen folds and the user
is referred to the full profiling functions discussed in section \ref{subsec:prof}.

\subsubsection{Model-based optimization} \label{subsec:prostatembo}

In contrast to naive grid search, model-based optimization comprises more
elegant methods for hyperparameter tuning. \pkg{tramnet} offers the \cmd{mbo\_tramnet}
and \cmd{mbo\_recommended} functions. The former implements Kriging-based hyperparameter
tuning for the elastic net, the LASSO and ridge regression. \cmd{mbo\_tramnet}
takes a \cls{tramnet} object as input and computes the cross-validated log-likelihood
based on the provided \code{fold} or \code{folds} arugment. The initial design
is a random latin hypercube design with \code{n\_design} rows per parameter.
The number of sequential fits of the surrogate models is specified through
\code{n\_iter} and the range of the tuning parameters can be specified by
\code{max/min} arguments. The default infill criterion is expected improvement.
However, \citet{mlrMBO} encourage the use of the lower confidence bound over
expected improvement, which can be achieved in \cmd{mbo\_tramnet} by specifying
\code{opt\_crit~=~makeMBOInfillCritCB()}. 10-fold cross-validation is used to compute
the objective function for the initial design and each iteration. The recommended
model is then extracted using \cmd{mbo\_recommended}.
<<prostate_mbo, results="hide", eval=FALSE>>=
tmbo <- mbo_tramnet(mt, obj_type = "elnet", fold = 10)
mtmbo <- mbo_recommended(tmbo, m0, x)
@
<<load_from_dat, echo=FALSE>>=
if (file.exists("cache.rda")) {
    load("cache.rda")
} else {
  <<prostate_mbo>>
  save(tmbo, mtmbo, file = "cache.rda")
}
@
Unlike in the previous section, one can directly optimize the tuning parameters
in an elastic net problem instead of optimizing over one hyperparameter at a time
or having to specify LASSO or ridge regression \emph{a priori}. The output of
\cmd{mbo\_tramnet} is quite verbose and can be shortened by using the helper
function \cmd{print\_mbo}.
<<prostate_mbo_recommended, eval=av>>=
print_mbo(tmbo)
@
Interpreting the output, model-based optimization suggests an unpenalized model
with $\alpha = \Sexpr{round(tmbo$x$alp, 2)}$ and $\lambda = \Sexpr{round(tmbo$x$lmb, 2)}$.
This result stresses the advantages of model-based optimization over naive or random
grid search in terms of complexity and computational efficiency. In the end, the
proposed model is unpenalized and thus does not introduce sparsity in the regression
coefficients.
<<prostate_coefs, eval=av>>=
coef(mtmbo)
summary(mtmbo)$sparsity
@

\subsubsection{Regularization paths} \label{subsec:prof}

As discussed before, it may be useful to inspect the full regularization paths
over the tuning parameters $\lambda$ and $\alpha$. Akin to the functions \cmd{profL1}
and \cmd{profL2} in package \pkg{penalized}, \pkg{tramnet} offers \cmd{prof\_lambda}
and \cmd{prof\_alpha}. Since these functions take a fitted model of class
\cls{tramnet} as input, which is internally updated it is crucial to correctly
specify the other tuning parameter in the model fitting step. In the example to
come, \code{mt} was fit using $\alpha = 1$ and $\lambda = 0$ resulting in a
LASSO penalty only when profiling over $\lambda$. The resulting profile is depicted
in Figure \ref{fig:prof}.
% The left panel shows the log-likelihood
% trajectory, which may not always be interesting to look at and is suppressed by
% default but can be toggled by \code{plot\_logLik = TRUE}.
<<profiling, eval=FALSE>>=
pfl <- prof_lambda(mt)
@
<<load_from_dat2, echo=FALSE, eval=av>>=
if (file.exists("cache2.rda")) {
    load("cache2.rda")
} else {
  <<cvl_prof_setup>>
  <<profiling>>
  save(pfl, file = "cache2.rda")
}
@
\cmd{prof\_lambda} takes \code{min\_lambda}, \code{max\_lambda} and \code{nprof}
as arguments and internally generates an equi-spaced sequence from \code{min\_lambda}
to \code{max\_lambda} on the log scale of length \code{nprof}. By default this
sequence ranges from $0$ to $15$ and is of length $5$.
<<profplotcode, eval=FALSE, echo=TRUE>>=
plot_path(pfl, plot_logLik = FALSE, las = 1, col = coll)
@

\begin{figure}
\centering
<<plot_setup, include=FALSE, echo=FALSE, eval=av>>=
coll <- qualitative_hcl(n = ncol(pfl$cfx))
@
<<profiling_plot, fig.height=4, fig.width=6.5, out.width="1\\textwidth", echo=FALSE, eval=av>>=
plot_path(pfl, plot_logLik = FALSE, las = 1, col = coll)
@
\caption{Full regularization paths for the tuning parameter $\lambda$ using the
default values of \cmd{plot\_path}.}
\label{fig:prof}
\end{figure}

\subsection{Additional constraints} \label{subsec:otherconstr}

In some applications, the specification of additional constraints on the shift
parameters $\shiftparm$ are of interest. Most commonly, either positivity or
negativity for some or all regression coefficients is aimed at. In \pkg{tramnet}
additional inequality constraints can be specified via the \code{constraints} argument,
which are internally handled as \code{constraints[[1]] \%*\% beta $>$ constraints[[2]]}.
Hence, to specify the constraint of strictly positive regression coefficients,
one would supply an identity matrix of dimension $p$ for the left hand side and
the zero $p$-vector for the left hand side, as done in the following example.
<<addconst, eval=av>>=
m0 <- BoxCox(lpsa ~ 1, data = Prostate, extrapolate = TRUE)
mt <- tramnet(m0, x, alpha = 0, lambda = 0, constraints = list(diag(8),
                                                               rep(0, 8)))
coef(mt)
@
The coefficients with a negative sign in the model without additional positivity
constraints now shrink to zero and the other coefficient estimates change
as well.
<<addconst_sparsity, eval=av>>=
summary(mt)$sparsity
@
One can compare this model to the implementation in \pkg{tram}, where it is also
possible to specify linear inequality constraints on the regression coefficients
$\shiftparm$. Here, it is sufficient to specify
\code{constraints = c("age >= 0", "lcp >= 0")}
for the two non-positive coefficient estimates, due to the convexity of the
underlying problem.
<<addconst_tram_cmp, eval=av>>=
m <- BoxCox(lpsa ~ . - psa, data = Prostate, extrapolate = TRUE,
            constraints = c("age >= 0", "lcp >= 0"))
max(abs(coef(m) - coef(mt, tol = 0)))
@
Indeed, both optimizers arrive at virtually the same coefficient estimates.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{\code{S3} Methods} \label{sec:methods}
%-------------------------------------------------------------------------------

Building on the \code{S3} infrastructure of the packages \pkg{mlt} and \pkg{tram},
this package provides corresponding methods for the following generics: \cmd{coef},
\cmd{logLik}, \cmd{plot}, \cmd{predict}, \cmd{simulate} and \cmd{residuals}.
The methods' additional \cls{tramnet}-specific arguments will be briefly discussed
in this section.

\cmd{coef.tramnet} suppresses the baseline transformation's coefficient estimates
$\hat\parm$ by default and considers shift parameter estimates $\hat\shiftparm$
below $10^{-6}$ as $0$ to stress the selected variables only.
This threshold can be controlled by the \code{tol} argument.
Hence, \code{coef(mt, with\_baseline = TRUE, tol = 0)} returns all coefficients.
<<coef_method, eval=av>>=
coef(mtmbo, with_baseline = TRUE, tol = 0)
@
The \cmd{logLik.tramnet} method allows the log-likelihoods re-computation under
new data (i.e. out-of-sample) and different coefficients (\code{parm}) and weights
(\code{w}), as illustrated below.
<<logLik_method, eval=av>>=
logLik(mtmbo)
cfx <- coef(mtmbo, with_baseline = TRUE, tol = 0)
cfx[5:8] <- 0.5
logLik(mtmbo, parm = cfx)
logLik(mtmbo, newdata = Prostate[1:10,])
logLik(mtmbo, w = runif(n = nrow(mtmbo$x)))
@
In the spirit of \pkg{mlt}'s plotting methods for classes \cls{mlt} and \cls{ctm},
\cmd{plot.tramnet} offers diverse plotting options for objects of class \cls{tramnet}.
The specification of new data and the type of plot is illustrated in the follwing code
chunk and Figure \ref{fig:plot}.
<<include=FALSE, eval=FALSE>>=
mtmbo$model$data <- mtmbo$model$data[1:10,,drop=FALSE]
mtmbo$x <- mtmbo$x[1:10,,drop=FALSE]
@
<<plot_method_chunk, fig.height=7, fig.width=5, eval=FALSE, eval=av>>=
par(mfrow = c(3, 2)); K <- 3e2
plot(mtmbo, type = "distribution", K = K, main = "A") # A, default
plot(mtmbo, type = "survivor", K = K, main = "B") # B
plot(mtmbo, type = "trafo", K = K, main = "C") # C
plot(mtmbo, type = "density", K = K, main = "D") # D
plot(mtmbo, type = "hazard", K = K, main = "E") # E
plot(mtmbo, type = "trafo", newdata = Prostate[1, ], col = 1, K = K, main = "F") # F
@

\begin{figure}
\centering
<<plot_method_fig, fig.height=7, fig.width=5, echo=FALSE, eval=av>>=
par(mfrow = c(3, 2)); K <- 3e2
plot(mtmbo, type = "distribution", K = K, main = "A") # A, default
plot(mtmbo, type = "survivor", K = K, main = "B") # B
plot(mtmbo, type = "trafo", K = K, main = "C") # C
plot(mtmbo, type = "density", K = K, main = "D") # D
plot(mtmbo, type = "hazard", K = K, main = "E") # E
plot(mtmbo, type = "trafo", newdata = Prostate[1, ], col = 1, K = K, main = "F") # F
@
\caption{Illustration of \cmd{plot.tramnet}'s versatility in visualizing the
response's estimated conditional distribution on various scales including cdf,
survivor, transformation scale and pdf. Note that by default the plot is produced
for each row in the design matrix. In unstratified linear transformation models
this leads to shifted versions of the same curve on the transformation function's
scale.
A: Estimated conditional distribution function for every observation.
B: Estimated conditional survivor function for every observation. The conditional
survivor function is defined as $S(\ry \given \rx) = 1 - \pY(\ry \given \rx)$.
C: Conditional most likely transformation for every observation. Note that every conditional
transformation function is a shifted version of the same curve.
D: The conditional density for every observation can be calculated using
$\dY(\ry \given \rx) = F'(\basisy(\ry)^\top\parm - \rx^\top\shiftparm)\basisy'
  (\ry)^\top\parm$.
E: A distribution function is fully characterized by its hazard function
$\lambda(\ry \given \rx) = \dY(\ry \given \rx) / S(\ry \given \rx)$, which is depicted
in this panel.
F: The \code{newdata} argument can be used to plot the predicted most likely transformation
for the provided data, in this case the first row of the \code{Prostate} data.}
\label{fig:plot}
\end{figure}

The \cmd{predict.tramnet} method works in the same way as \cmd{predict.mlt} and
as such supports the types \code{trafo, distribution, survivor, density, logdensity,
hazard, loghazard, cumhazard} and \code{quantile}. For \code{type = "quantile"}
the corresponding probabilities (\code{prob}) have to be supplied as an argument
to evaluate the quantile function at.
<<predict_method, eval=av>>=
predict(mtmbo, type = "quantile", prob = 0.2, newdata = Prostate[1:5,])
@
Another method offered by this package implements parametric bootstrap-based
sampling. In particular, \cmd{simulate.tramnet} calls the \cmd{simulate.ctm}
function after converting the \cls{tramnet} object to a \cls{ctm} object.
<<simulate_method, eval=av>>=
simulate(mtmbo, nsim = 1, newdata = Prostate[1:5,], seed = 1)
@
Lastly, \cmd{residuals.tramnet} computes the generalized residual $r$ defined as
the score contribution for sample $i$ with respect to a newly introduced intercept
parameter $\gamma$, which is restricted to be zero. In particular,
\begin{eqnarray*}
  r = \frac{\partial \ell\left(\parm,\shiftparm,\gamma;\ry, \rs, \rx\right)}{\partial
    \gamma} \bigg\rvert_{\gamma = 0}
\end{eqnarray*}
yields the generalized residual with respect to $\gamma$ for the model
\begin{eqnarray*}
  \pY(\ry \given \rs, \rx) = F\left(\h(y \mid \rs) - \rx^\top \shiftparm - \gamma \right).
\end{eqnarray*}
<<residuals_method, eval=av>>=
residuals(mtmbo)[1:5]
@
In residual analysis and boosting it is common practice to check for associations
between residuals and covariates that are not included in the model.
In the prostate cancer example one could investigate, whether the variables \code{age}
and \code{lcp} should be included in the model. To illustrate this particular case
a non-parametric \cmd{independence\_test} from package \pkg{coin} can be used \citep{coinjss}.
First, the uncoditional transformation model \code{m0} is fit. Afterwards, the tramnet
models excluding \code{age} and \code{lcp} are estimated and their residuals
extracted using the \cmd{residuals.tramnet} method. Lastly, an independence test
using a maximum statistic (\code{teststat~=~"max"}) and a Monte Carlo based
approximation of the null distribution based on resampling $10^6$ times
(\code{distribution~=~approximate(1e6)}), yields the results printed below.
<<coin_illustration, eval=FALSE, eval=av>>=
library("coin")
m0 <- BoxCox(lpsa ~ 1, data = Prostate, extrapolate = TRUE)
x_no_age_lcp <- x[, !colnames(x) %in% c("age", "lcp")]
mt_no_age_lcp <- tramnet(m0, x_no_age_lcp, alpha = 0, lambda = 0)
r <- residuals(mt_no_age_lcp)
it <- independence_test(r ~ age + lcp, data = Prostate,
                        teststat = "max", distribution = approximate(1e6))
pvalue(it, "single-step")
@
<<load_from_dat3, echo=FALSE>>=
if (file.exists("cache3.rda")) {
    load("cache3.rda")
} else {
  <<coin_illustration>>
  tmp <- pvalue(it, "single-step")
  save(tmp, file = "cache3.rda")
}
tmp
@
Because there is substantial evidence against independence of the models' residuals
to either lcp or age, we can conclude that it is
worthwhile to include \code{age} and \code{lcp} in the model.
Packages \pkg{trtf} \citep{pkg:trtf} and \pkg{tbm} \citep{hothorn2019tbm,pkg:tbm}
make use of this definition of a residual for estimating and boosting transformation models,
trees and random forests. For more theoretical insight the reader is referred to
the above mentioned publications.

%EOF%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\bibliography{kook-hothorn}

\address{Lucas Kook, Torsten Hothorn\\
  Institut f\"ur Epidemiologie, Biostatistik und Pr\"avention \\
  Universit\"at Z\"urich \\
  Hirschengraben 84, CH-8001 Z\"urich\\
  Switzerland\\
  \email{lucasheinrich.kook@uzh.ch}, \email{Torsten.Hothorn@R-project.org}}

\newpage

<<packages, echo = FALSE, results = "hide", eval=av>>=
if (file.exists("packages.bib")) file.remove("packages.bib")
pkgversion <- function(pkg) {
    pkgbib(pkg)
    packageDescription(pkg)$Version
}
pkgbib <- function(pkg) {
    x <- citation(package = pkg, auto = TRUE)[[1]]
    b <- toBibtex(x)
    b <- gsub("Buehlmann", "B{\\\\\"u}hlmann", b)
    b[1] <- paste("@Manual{pkg:", pkg, ",", sep = "")
    if (is.na(b["url"])) {
        b[length(b)] <- paste("   URL = {http://CRAN.R-project.org/package=",
                              pkg, "}", sep = "")
        b <- c(b, "}")
    }
    cat(b, sep = "\n", file = "packages.bib", append = TRUE)
}
pkg <- function(pkg)
    paste("\\\\pkg{", pkg, "} \\\\citep[version~",
          pkgversion(pkg), ",][]{pkg:", pkg, "}", sep = "")
pkg("tram")
pkg("trtf")
pkg("tbm")
@

<<sesinf, echo=FALSE>>=
sessionInfo()
@

\end{article}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}