\documentclass[titlepage,11pt]{article}
\usepackage{url,hyperref,microtype,subcaption}
\hypersetup{colorlinks=true,bookmarksnumbered,pdfborder={0 0 0.25}}
\usepackage{graphicx}
% ,citebordercolor={0.2667 0.4667 0.6667},citecolor=bluepurple,linkbordercolor={0.6667 0.2 0.4667},linkcolor=redpurple,urlbordercolor={0.1333 0.5333 0.2},urlcolor=green,breaklinks=true}
% \usepackage[vertfit=local]{breakurl}% only for arXiv
%% NOTE: Frontiers in Human Neuroscience does allow numbered sections, see for example
%% https://www.frontiersin.org/articles/10.3389/fnhum.2022.1032724/full
%% https://www.frontiersin.org/articles/10.3389/fnhum.2022.1077416/full
%% Other examples in the 2022 volume also show free section naming.
%
% In case of non-numbered sections:
% \makeatletter
% \AtBeginDocument{%
% \@ifdefinable{\myorg@nameref}{%
% \LetLtxMacro\myorg@nameref\nameref
% \DeclareRobustCommand*{\nameref}[1]{%
% \textbf{\color{green}\myorg@nameref{#1}}%
% }%
% }%
% }
% \makeatother
\usepackage[myheadings]{fullpage}
\usepackage{pmetrika}
\usepackage{pmbib}
\usepackage{natbib}
\usepackage{caption}
\usepackage{subcaption}
\usepackage{submit}
\setcounter{secnumdepth}{2}
\usepackage[usenames]{xcolor}
% Tol (2012) colour-blind-, print-, screen-friendly colours, alternative scheme; Munsell terminology
\definecolor{bluepurple}{RGB}{68,119,170}
\definecolor{blue}{RGB}{102,204,238}
\definecolor{green}{RGB}{34,136,51}
\definecolor{yellow}{RGB}{204,187,68}
\definecolor{red}{RGB}{238,102,119}
\definecolor{redpurple}{RGB}{170,51,119}
\definecolor{grey}{RGB}{187,187,187}
\definecolor{lgrey}{RGB}{221,221,221}
\definecolor{notecolour}{RGB}{68,170,153}
%\newcommand*{\puzzle}{\maltese}
\newcommand*{\puzzle}{{\fontencoding{U}\fontfamily{fontawesometwo}\selectfont\symbol{225}}}
\newcommand*{\wrench}{{\fontencoding{U}\fontfamily{fontawesomethree}\selectfont\symbol{114}}}
\newcommand*{\pencil}{{\fontencoding{U}\fontfamily{fontawesometwo}\selectfont\symbol{210}}}
\newcommand{\mynotew}[1]{{\color{notecolour}\wrench\ #1}}
\newcommand{\mynotep}[1]{{\color{notecolour}\pencil\ #1}}
\newcommand{\mynotez}[1]{{\color{notecolour}\puzzle\ #1}}
\usepackage{wrapfig}
\providecommand{\href}[2]{#2}
\providecommand{\eprint}[2]{\texttt{\href{#1}{#2}}}
\newcommand*{\amp}{\&}
% \newcommand*{\citein}[2][]{\textnormal{\textcite[#1]{#2}}%\addtocategory{extras}{#2}
% }
\newcommand*{\citein}[2][]{\textnormal{\citet[#1]{#2}}%\addtocategory{extras}{#2}
}
\newcommand*{\citebi}[2][]{\citet[#1]{#2}%\addtocategory{extras}{#2}
}
\newcommand*{\subtitleproc}[1]{}
\newcommand*{\chapb}{ch.}
%
%\def\UrlOrds{\do\*\do\-\do\~\do\'\do\"\do\-}%
\def\myUrlOrds{\do\0\do\1\do\2\do\3\do\4\do\5\do\6\do\7\do\8\do\9\do\a\do\b\do\c\do\d\do\e\do\f\do\g\do\h\do\i\do\j\do\k\do\l\do\m\do\n\do\o\do\p\do\q\do\r\do\s\do\t\do\u\do\v\do\w\do\x\do\y\do\z\do\A\do\B\do\C\do\D\do\E\do\F\do\G\do\H\do\I\do\J\do\K\do\L\do\M\do\N\do\O\do\P\do\Q\do\R\do\S\do\T\do\U\do\V\do\W\do\X\do\Y\do\Z}%
\makeatletter
%\g@addto@macro\UrlSpecials{\do={\newline}}
\g@addto@macro{\UrlBreaks}{\myUrlOrds}
\makeatother
\newcommand*{\arxiveprint}[1]{%
arXiv \doi{10.48550/arXiv.#1}%
}
\newcommand*{\mparceprint}[1]{%
\href{http://www.ma.utexas.edu/mp_arc-bin/mpa?yn=#1}{mp\_arc:\allowbreak\nolinkurl{#1}}%
}
\newcommand*{\haleprint}[1]{%
\href{https://hal.archives-ouvertes.fr/#1}{\textsc{hal}:\allowbreak\nolinkurl{#1}}%
}
\newcommand*{\philscieprint}[1]{%
\href{http://philsci-archive.pitt.edu/archive/#1}{PhilSci:\allowbreak\nolinkurl{#1}}%
}
\newcommand*{\doi}[1]{%
\href{https://doi.org/#1}{\textsc{doi}:\allowbreak\nolinkurl{#1}}%
}
\newcommand*{\biorxiveprint}[1]{%
bioRxiv \doi{10.1101/#1}%
}
\newcommand*{\osfeprint}[1]{%
Open Science Framework \doi{10.31219/osf.io/#1}%
}
\newcommand*{\osfproj}[1]{%
Open Science Framework \doi{10.17605/osf.io/#1}%
}
\usepackage{graphicx}
%% symbol = for equality statements within probabilities
\newcommand*{\mo}[1][=]{\mathord{\,#1\,}}
%%
\newcommand*{\sect}{\S}% Sect.~
\newcommand*{\sects}{\S\S}% Sect.~
\newcommand*{\chap}{ch.}%
\newcommand*{\chaps}{chs}%
\newcommand*{\bref}{ref.}%
\newcommand*{\brefs}{refs}%
%\newcommand*{\fn}{fn}%
\newcommand*{\eqn}{eq.}%
\newcommand*{\eqns}{eqs}%
\newcommand*{\fig}{fig.}%
\newcommand*{\figs}{figs}%
\newcommand*{\vs}{{vs}}
\newcommand*{\eg}{{e.g.}}
\newcommand*{\etc}{{etc.}}
\newcommand*{\ie}{{i.e.}}
\newcommand*{\ca}{{c.}}
\newcommand*{\foll}{{ff.}}
%\newcommand*{\viz}{{viz}}
\newcommand*{\cf}{{cf.}}
%\newcommand*{\Cf}{{Cf.}}
%\newcommand*{\vd}{{v.}}
\newcommand*{\etal}{{et al.}}
%\usepackage{fancybox}
\usepackage{framed}
% \newenvironment{description}{}{}
% \usepackage[shortlabels,inline]{enumitem}
% \SetEnumitemKey{para}{itemindent=\parindent,leftmargin=0pt,listparindent=\parindent,parsep=0pt,itemsep=\topsep}
% \setlist{itemsep=0pt,topsep=\parsep}
% \setlist[enumerate,2]{label=(\roman*)}
% \setlist[enumerate]{label=(\alph*),leftmargin=1.5\parindent}
% \setlist[itemize]{leftmargin=1.5\parindent}
% \setlist[description]{leftmargin=1.5\parindent}
\usepackage{bm}
\usepackage{mathtools}
\usepackage[main=british]{babel}\selectlanguage{british}
%\newcommand*{\langnohyph}{\foreignlanguage{nohyphenation}}
\newcommand{\langnohyph}[1]{\begin{hyphenrules}{nohyphenation}#1\end{hyphenrules}}
\usepackage[autostyle=false,autopunct=false,english=british]{csquotes}
\setquotestyle{american}
\newcommand*{\defquote}[1]{`\,#1\,'}
\usepackage{upgreek}
%% Macros
\DeclarePairedDelimiter\abs{\lvert}{\rvert}
\DeclarePairedDelimiter\set{\{}{\}} %}
\newcommand*{\p}{\mathrm{p}}%probability
\renewcommand*{\P}{\mathrm{P}}%probability
\newcommand*{\E}{\mathrm{E}}
%% The "\:" space is chosen to correctly separate inner binary and external relationss
\renewcommand*{\|}[1][]{\nonscript\:#1\vert\nonscript\:\mathopen{}}
\newcommand*{\di}{\mathrm{d}}
\newcommand*{\defd}{\coloneqq}
\newcommand*{\defs}{\eqqcolon}
\newcommand*{\Land}{\bigwedge}
\newcommand*{\zerob}[1]{\makebox[0pt][c]{#1}}
\newcommand*{\delt}{\updelta}
\newcommand*{\eU}{\bar{U}}
%% Variates:
\newcommand*{\age}{\texttt{Age}}
\newcommand*{\sex}{\texttt{Sex}}
\newcommand*{\apoe}{\texttt{APOE4}}
\newcommand*{\hv}{\texttt{HV}}
\newcommand*{\anart}{\texttt{ANART}}
\newcommand*{\cft}{\texttt{CFT}}
\newcommand*{\gds}{\texttt{GDS}}
\newcommand*{\ravltimm}{\texttt{RAVLT-imm}}
\newcommand*{\ravltdel}{\texttt{RAVLT-del}}
\newcommand*{\ravltrec}{\texttt{RAVLT-rec}}
\newcommand*{\tmta}{\texttt{TMTA}}
\newcommand*{\tmtb}{\texttt{TMTB}}
\newcommand*{\cad}{\texttt{cAD}}
\newcommand*{\smci}{\texttt{sMCI}}
\newcommand*{\yes}{\texttt{Y}}
\newcommand*{\no}{\texttt{N}}
\newcommand*{\predictors}{\texttt{\footnotesize predictors}}
\newcommand*{\predictand}{\texttt{\footnotesize predictand}}
\newcommand*{\dataset}{\texttt{\footnotesize dataset}}
\newcommand*{\auxinfo}{\texttt{\footnotesize aux\,info}}
\newcommand*{\population}{\texttt{\footnotesize population}}
\newcommand*{\diseasep}{\texttt{\footnotesize disease+}}
\newcommand*{\diseasem}{\texttt{\footnotesize disease\textminus}}
\newcommand*{\predictorp}{\texttt{\footnotesize predictor+}}
\newcommand*{\predictorm}{\texttt{\footnotesize predictor\textminus}}
\newcommand*{\trials}{\texttt{\footnotesize trials}}
%
\newcommand*{\ad}{Alzheimer's Disease}
\newcommand*{\mci}{Mild Cognitive Impairment}
\newcommand*{\ljm}{Ledley-Jaynes machine}
\newcommand*{\AD}{\textsc{ad}}
\newcommand*{\MCI}{\textsc{mci}}
\newcommand*{\adni}{\textsc{adni}}
% \setcounter{section}{-1}
\begin{document}
\begin{titlepage}
\title{Personalized prognosis \amp\ treatment\\ using Ledley-Jaynes machines:\\ An example study on conversion\\ from Mild Cognitive Impairment to Alzheimer's Disease}
\author{P.G.L. Porta~Mana\,\href{https://orcid.org/0000-0002-6070-0784}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}}}
\affil{Western Norway University of Applied Sciences, Norway}
\author{I.~Rye\,\href{https://orcid.org/0000-0003-4822-9480}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}}}
\affil{University of Oslo, Norway}
\author{A.~Vik\,\href{https://orcid.org/0000-0003-0374-9327}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}}}
\affil{Haukeland University Hospital, Norway}
\author{M.~Koci\'nski\,\href{https://orcid.org/0000-0001-7088-4823}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}},
A.~Lundervold\,\href{https://orcid.org/0000-0002-6819-6164}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}},
A.J.~Lundervold\,\href{https://orcid.org/0000-0001-8663-4247}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}}
}
\affil{University of Bergen, Norway}
\author{A.S.~Lundervold\,\href{https://orcid.org/0000-0001-8663-4247}{\protect\includegraphics[scale=0.25]{orcid_32x32.png}}}
\affil{Western Norway University of Applied Sciences, Norway}
\vspace{\fill}\centerline{\today}\vspace{\fill}
%\comment{This research was funded by .}
%\thanks{I would like to thank .}
\linespacing{1}
\contact{Correspondence should be sent to\\
\noindent E-Mail: [email protected]
% \noindent Phone: \break
% \noindent Fax: \break
% \noindent Website:
}
\end{titlepage}
\setcounter{page}{2}
\vspace*{2\baselineskip}
\RepeatTitle{Personalized prognosis \amp\ treatment using Ledley-Jaynes machines:\\ An example study on conversion\\ from Mild Cognitive Impairment to Alzheimer's Disease}\vskip3pt
\linespacing{1.5}
%% ITEM 8 [See the "howto.tex" file.]
\abstracthead
\begin{abstract}
%\mynotew{Alexander 24.01: A first pass below. Will make another pass tomorrow (Wednesday)}
The present work introduces a statistically sound, rigorous, and model-free algorithm
%rigorous
% sound
% principled
% first-principled
% theoretically optimal
% supercalifragilisticexpialidocious
% ...
-- the \ljm\ -- for use in personalized medicine.
The \ljm\ is designed first to learn from a dataset of clinical data with relevant predictors and predictands, and then to assist a clinician in the assessment of prognosis \amp\ treatment for new patients. It allows the clinician to input, for each new patient, additional patient-dependent clinical information, as well as patient-dependent information about benefits and drawbacks of available treatments. We apply the algorithm in a realistic setting for clinical decision-making, incorporating clinical, environmental, imaging, and genetic data, using a dataset of subjects suffering from mild cognitive impairment and \ad. We show how the algorithm is theoretically optimal, and discuss some of its major advantages for decision-making under risk, resource planning, imputation of missing values, assessing the prognostic importance of each predictor, and more.
\begin{keywords}
Clinical decision making, Utility theory, Probability theory, Artificial Intelligence, Machine Learning, Base-rate fallacy
\end{keywords}
\end{abstract}\vspace{\fill}\pagebreak
\section{Introduction: Personalized prognosis, treatment, and computer algorithms}
\label{sec:intro}
\setcounter{subsection}{-1}
\subsection{Prologue: Four unique patients} % this fits quite well with the theme of the special issue of the journal
% \section{Four unique patients} % alternative title
\label{sec:four_patients}
Meet Olivia, Ariel, Bianca, Curtis.\footnote{These are purely fictitious characters with clinically realistic conditions; any reference to real persons is purely coincidental.}
% names from Shakespeare's plays
These four persons don't know each other, but they have something in common: they all suffer from a mild form of cognitive impairment, and are afraid that their impairment will turn into \ad\ within a couple of years. This is why each of them recently underwent a wide range of clinical examinations and tests, including brain imaging. % collective noun for "tests"
%clinical examination.
Today they are receiving the results. Based on their individual results, on available clinical statistical data, and on other relevant information, their clinician will
% indicate % [what does this mean?]
assess their risk of developing \ad. Then, together with the patients and their relatives, the clinician will make a decision among four distinct preventive-treatment options, available to each patient.\footnote{\label{fn:treatment}In the present paper, we use \enquote{prognosis} in a general sense to include also \enquote{diagnosis}, and \enquote{treatment} quite loosely to mean any course of action a clinician might take, including preventive treatment or even \enquote{additional tests}.} In these tasks, the clinician will be helped by a computer algorithm.
Besides a shared diagnosis of \mci\ and associated worries, these patients have other things in common -- but also some differences. Let's take Olivia as reference, and list the similarities and differences between her and the other three patients:
\begin{itemize}
\item Olivia and Ariel have identical results on the clinical and laboratory measures, and the same age. They would also incur similar benefits and losses from the four available treatment options. Ariel, however, comes from a different geographical region, which presents a higher rate of conversion from \mci\ to \ad. And unlike Olivia, Ariel comes from a family with a strong history of \ad. Because of this geographical and family background, and of relevant statistics found in the literature, the clinician judges, before seeing the clinical data, that there's a 65\% probability that Ariel's cognitive impairment will convert to \ad.
\item Olivia and Bianca have identical clinical results and age; they also come from the same geographical region and have very similar family histories. In fact, we shall see that they have the same probability of developing \ad. Bianca, however, suffers from several allergies and additional clinical conditions that render some of the treatment options slightly riskier for her.
\item Olivia and Curtis have different results on all measures included in the clinical and laboratory examinations; Olivia is also more than 10 years older than Curtis. They otherwise come from the same geographical region, have very similar family histories, and would incur similar benefits or losses from the treatment options. Note that the imaging result for Curtis (hippocampal volume) is missing.
\end{itemize}
Considering the similarities and differences among these patients, which of the four available treatments will be optimal for each of them? The clinician will find that, despite the many factors in common among our four patients -- even despite Olivia's, Ariel's, and Bianca's identical clinical results, and Olivia's and Bianca's identical probability of conversion to \ad\ -- \emph{the optimal treatment for each patient is different from those for the other three} -- how come?
\subsection{Assistive computer algorithms: personalized input and output}
\label{sec:intro_purposes}
In the example above, we said \enquote{in these tasks, the clinician will be helped by a computer algorithm}. The need for such computational help is clear from the vast amount of clinical statistical data and the large number of clinical predictors today available to clinicians. But how should such an assistive computer algorithm be designed in order to take fully into account patient differences?
Although the example above concerns specifically \ad, the differences among patients
\begin{wrapfigure}{r}{0.25\linewidth}%
\vspace{-1.5em}%
\includegraphics[width=\linewidth]{OABC.png}%
\vspace{-2em}% \caption{\mynotep{draft, needs better font sizes}}\label{fig:OABC}
\end{wrapfigure}
described there apply more generally to most, if not all, clinical problems of prognosis and treatment. These differences can be broadly categorized as \enquote{difference in auxiliary or supplementary tests and background information} (Olivia vs Ariel), \enquote{difference in benefit and availability of treatments} (Olivia vs Bianca), \enquote{difference in clinical predictors} (Olivia vs Curtis), as schematized in the side figure. Each of these difference categories can affect the clinician's final choice of optimal treatment. An assistive algorithm should therefore reflect these differences in its input, its output, or both:
\begin{itemize}
\item In principle, there could be three kinds of input \enquote{slots}, where the clinician can input the current patient's specific values as regards clinical predictors, auxiliary information, and treatment options \amp\ benefits.
\item If input slots are only available for one or two of the categories above, the output should at least be of such a kind as to allow the clinician to integrate the current patient's specific values of the missing input categories.
\end{itemize}
To appreciate these requirements, one should contrast them with the input and output of many kinds of machine-learning classification algorithms. These typically only allow the input of a patient's clinical predictors, with no space for patient-specific auxiliary information or for adjustments for differences in background statistics (think of Olivia vs Ariel). And they typically output only a discrete prognostic label (say, \enquote{stable \mci} vs \enquote{conversion to \ad}), but no measure of the uncertainty about that label. Unfortunately, such output does not allow the clinician to assess treatment benefits and losses for the current patient, for this assessment depends not on the presence, now or in the future, of a disease, but on the \emph{risk} of its presence. We shall discuss these points at length in \sects~\ref{sec:population_step} and~\ref{sec:utilities_step}.
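As a toy illustration, with utilities of our own choosing rather than taken from any study (the full treatment of this point is given in \sect\,\ref{sec:utilities_step}): suppose a preventive treatment has utility $-2$ for a patient who would in fact remain stable and $+8$ for one who would convert, whereas withholding it has utilities $0$ and $-10$ respectively. If $r$ denotes the patient's probability of conversion, the expected utilities are
\begin{equation*}
\E(U \| \textrm{treat}) = 8\,r - 2\,(1-r) ,
\qquad
\E(U \| \textrm{no treat}) = -10\,r ,
\end{equation*}
so the treatment is preferable whenever $r > 0.1$. A patient with $r \approx 0.05$ and one with $r \approx 0.3$ would both typically be labelled \enquote{stable \mci} by a classifier, yet the optimal decisions for them differ.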
\medskip
The purpose of the present work is to present an assistive algorithm that meets the requirements above. This algorithm is designed to first learn from a dataset of clinical data with relevant predictors and predictand%
%%%% Note on "predictand"
%
% The quantity that we want to forecast is in various texts called "dependent variable" or "response variable". I personally don't like either. Both can be misleading. Surely AD-conversion is not "dependent" on cognitive variates, for example. Moreover we'll see that we are actually swapping the role of "independent" and "dependent" variables in this work. Same goes for "response". With a readership of medical scientists it's best to avoid the special connotations of these words, leaving them for variables that are indeed biologically dependent.
%
% This leaves us with "predictand", literally "what has to be predicted", which is exactly what AD-conversion is. This term is used in climate science and meteorology. It's good because it does not misleadingly imply that AD-conversion biologically depends or is a consequence of other variables.
%%%%
\footnote{literally \enquote{quantity to be predicted} or, more generally, inferred (\cf\ \emph{measurand} in metrology, \citealt[2.3]{jcgm1997_r2012}). We find this term, used in meteorology and climate science, more precise and less obscure or misleading than \enquote{dependent variate}, \enquote{response variate}, \enquote{outcome variable}, or similar.}, and then assist a clinician in the assessment of prognosis \amp\ treatment for new patients. It offers these ten features:
\begin{enumerate}
\item\label{item:feat_variates} It can work with clinical predictors comprising any combination of categorical and one-dimensional (continuous, discrete ordinal, unbounded or bounded, uncensored or censored) variates. The predictand can also be any combination of categorical and one-dimensional variates.
\item\label{item:feat_indifferentvariates} It treats predictor and predictand variates on equal footing, in the sense that the clinician can at any moment decide to infer some other variate given the rest.
% at the risk of violating causality
\item\label{item:feat_conditionalstats} It does not require that the current patient be considered in all respects as a member of the population underlying the learning dataset. The patient can be considered a member only conditionally on particular variate values.
\item\label{item:feat_inputs} It accepts three inputs:
\begin{enumerate}
\item\label{item:input_predictors} the clinical-predictor values for the current patient;
\item\label{item:input_aux} information about which predictand-predictor relationships learned from the dataset can be generalized to the current patient, and a prior prognostic probability representing auxiliary information;
\item\label{item:input_treats} a set of treatment options and their benefits and losses for the current patient.
\end{enumerate}
\item\label{item:feat_outputs} It yields three basic outputs:
\begin{enumerate}
\item\label{item:ouput_predictors} any prognostic probabilities or likelihoods about predictors and predictand desired by the clinician, given input~\ref{item:input_predictors};
\item\label{item:output_aux} final prognostic probabilities, given inputs~\ref{item:input_predictors}--\ref{item:input_aux};
\item\label{item:output_treats} optimal treatment, given inputs~\ref{item:input_predictors}--\ref{item:input_treats};
\end{enumerate}
\item\label{item:feat_modular_inout} Its inputs and outputs are modular, in the sense that the clinician can, for instance, give inputs~\ref{item:input_predictors}--\ref{item:input_aux} only, get a prognostic probability~\ref{item:output_aux} as output, and then proceed to treatment assessment by other means or algorithms.
\item\label{item:feat_imputation} It works even if predictor data are missing, both in the learning dataset and for the current patient.
\item\label{item:feat_uncertainty} It can quantify the uncertainty of its own outputs, allowing for sensitivity analyses. For example, it can tell how much a prognostic probability could have been different if the learning dataset had been larger, or whether the optimal treatment could be different if a particular missing predictor for the current patient were available.
\item\label{item:feat_forecast} It can make various kinds of long-term forecasts, such as frequency of prognoses with given probabilities, frequency of prescribed treatments, and similar -- provided that the dataset used for its learning can be considered representative of the full population.
\item\label{item:feat_unbeatable} It is model-free and extracts the maximal amount of information theoretically contained in the learning dataset, and therefore achieves the maximal prognostic power that the predictors can yield. In other words, it is unbeatable.
\end{enumerate}
Let us comment on some of these features. We believe that the capability of working with complex predictands, feature~\ref{item:feat_variates}, is important for a more realistic and nuanced approach to prognosis. In the case of \ad, for instance, a simple dichotomy \enquote{has disease} vs. \enquote{doesn't have disease} is possibly an oversimplification\footnote{see \eg\ \citet{edmondsetal2015,edmondsetal2020}, whose methods we find, however, inconclusive.}.
% [Luca:] They standardize neuropsych. scores based on age and education "based on regression coefficients derived from the normal control group" -- this can already introduce unrealistic approximations (did they check if linearity holds?) and terrible biases. Then the rest is the usual astrology with Bonferroni corrections etc. The graphs in their papers don't really tell me anything. In Fig.~3 of the 2020 paper I would have liked to see the datapoints, not just the linear-regression lines. The fact that they omitted the points is a little suspicious: would the points show that the linear regression is unrealistic?
Without feature~\ref{item:feat_conditionalstats}, the capability of using auxiliary contextual information, the algorithm would be of no use in the frequently occurring case of patients with peculiar clinical contexts. The capability of dealing with missing data, feature~\ref{item:feat_imputation}, is important for a concrete implementation in a clinical setting, which is typically afflicted by missing-data problems. Feature~\ref{item:feat_uncertainty} is extremely important for a clinician to assess the reliability of final decisions and honestly inform the patient of the possibility of unwanted outcomes. Finally, features~\ref{item:feat_indifferentvariates} and~\ref{item:feat_unbeatable}, the fact that this algorithm yields the maximal amount of information jointly contained in all variates, make it valuable in general clinical research. The algorithm can, for example, forecast the maximal accuracy obtainable by \emph{any} inference algorithm based on the same predictors or a subset of those predictors; and it attains, by construction, that maximal accuracy. Further features of interest in Machine Learning are discussed in the next section.
We call this algorithm a \emph{\ljm}, for reasons explained in the next section. It is at the moment available as a collection of scripts\footnote{\doi{10.17605/osf.io/zb26t}, \url{https://github.com/pglpm/ledley-jaynes\_machine}.} in the R~programming language \citep{rcoreteam1995_r2023}, which we plan to assemble into a clinician-friendly R~package soon.
The methodology underlying this algorithm has been successfully demonstrated for \ad\ with a smaller number of predictors \citep{antonianovillalobosetal2014}, is used in many applications in astrophysics \citep{eht2019,eht2022,delpozzoetal2018}, and its advantages in neurocritical care have recently been emphasized \citep{jawaetal2023}.
The next section~\ref{sec:the_machine} gives an intuitive understanding of the \ljm's underlying principles and workings. The machine's concrete application is shown in \sect\,\ref{sec:application}, using the four-patient fictitious scenario of \sect\,\ref{sec:four_patients} as a concrete example, and subsection~\ref{sec:additional_results} discusses further applications to general medical research. A summary and discussion is given in \sect\,\ref{sec:discussion}. Mathematical details and proofs on which the present work is grounded are given in a companion technical note\footnote{\url{https://github.com/pglpm/ledley-jaynes\_machine/raw/main/ledley-jaynes\_machine.pdf}}, which also explains how to use the R~scripts.
We apologize to readers who may find some discussions or explanations too obvious, or some mathematical details too scarce. We wanted the present work to be accessible to a wide audience, from clinicians and students of medicine to researchers in machine learning and probability theory.
\bigskip% Newpage break just to help while writing the draft
\section{The \ljm}
\label{sec:the_machine}
This section may be of special interest to readers from Machine Learning and Artificial Intelligence. It is largely independent of the next one, which describes the machine's application. It can be read after \sect\,\ref{sec:application} by readers who would like to see the machine in action first.
\subsection{Underlying theory and characteristics}
\label{sec:the_machine_principles}
The method to solve clinical decision-making problems such as the one of \sect\,\ref{sec:intro} %, taking into account individual differences among patients,
is none other than Decision Theory: the combination of probability theory and utility theory. It integrates available clinical statistical data with each patient's unique combination of clinical results, auxiliary information, and treatment benefits, in a mathematical framework, completely determined by basic self-consistency requirements.\footnote{\citet[\chaps~13--14]{jaynes1994_r2003}; \citet{vonneumannetal1944_r1955,cox1946,savage1954_r1972,luceetal1957,raiffaetal1961_r2000,raiffa1968_r1970,lindley1971_r1988,kreps1988}.}
Medicine has the distinction of having been one of the first fields to adopt Decision Theory, with the pioneering work by Ledley and Lusted \citep{ledleyetal1959,ledleyetal1959b,ledleyetal1960,lustedetal1960}, who also promoted its algorithmic implementation \citep[see especially \sect\,1-5 p.~21]{ledley1959,ledley1960}. Clinical decision-making is today explained and exemplified in brilliant textbooks for medical students and clinicians \citep{weinsteinetal1980,soxetal1988_r2013,huninketal2001_r2014}. An outline is given in \sect\,\ref{sec:expected_utility_theory}.
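For orientation, the core prescription can be stated in one line (details and caveats are given in \sect\,\ref{sec:expected_utility_theory} and the textbooks just cited): choose the course of action $a$ that maximizes the expected utility
\begin{equation*}
\E(U \| a) = \sum_{y} U(a, y)\; \p(y \| \textrm{patient's data}) ,
\end{equation*}
where $y$ ranges over the possible predictand values and $U(a,y)$ is the utility of taking action $a$ when the patient's true predictand value turns out to be $y$.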
The \enquote{\ljm} is an algorithmic implementation, as dreamed by Ledley and Lusted, of the main calculations underlying the clinical decision-making process: from the comparison of a patient's specific predictors with the statistics offered by a clinical database, to the choice of optimal treatment. The name is a homage to Ledley -- who, incidentally, died of \ad\ \citep{shahetal2013} -- and to Jaynes \citeyearpar{jaynes1994_r2003}, who brilliantly explained the inductive logic underlying such a \enquote{robot}.
Decision theory is also the normative foundation for the construction of an Artificial Intelligence agent capable of rational inference and decision making \citetext{\citealt[part~IV]{russelletal1995_r2022}; \citealt[\chaps~1--2, 13--14]{jaynes1994_r2003}}. The \ljm\ can therefore be seen as an \emph{ideal} machine-learning algorithm. It is \enquote{ideal} in the sense of being free from special modelling assumptions (this is why we do not call it a \enquote{model}) and from limitations of informational output which affect most common machine-learning algorithms; not \enquote{ideal} in the sense of being impracticable. Quite the opposite, the present work shows that this ideal machine-learning algorithm can today be used in a wide range of inference problems at insubstantial computational cost. % It is preferable to popular algorithms such as neural networks, random forests, support-vector machines, which are unsuited to clinical decision-making problems owing to their input and output limitations and underlying special assumptions. We discuss this matter further in \sect\,\ref{sec:discussion}.
More concretely, the \ljm\ is ideal because \emph{it computes the probability distribution over all possible long-run frequency distributions from which the learning dataset can originate}, these frequency distributions being joint ones for all predictor and predictand variates.\footnote{This goes by the Sibylline technical name of \enquote{Bayesian nonparametric density regression}; see \eg\ \citet{rodriguezetal2009,bhattacharyaetal2010}; and Walker's \citeyearpar{walker2010} witty overview.} This is the maximum possible amount of information that can be extracted from the learning dataset, in a strict information-theoretic sense. From this probability distribution, the \ljm\ can indeed calculate any quantity outputted by other machine-learning algorithms. For example \citep[for terminology see \eg][\sect\,8.6]{murphy2012}:
\begin{itemize}
\item \emph{\enquote{Discriminative} algorithms:} the probability $\p(Y \| X)$ of any set of predictands $Y$ given any set of input predictors $X$.
\item \emph{\enquote{Generative} algorithms:} the probability $\p(X \| Y)$ of any set of input predictors $X$ given any set of predictand values $Y$.
More generally, the machine can compute any joint, marginal, or conditional probabilities $p(Z',Z'')$, $p(Z')$, $p(Z'\|Z'')$ for any desired subsets of variates $Z', Z''$.
\item \emph{Regression or classification:} the expected value $\E(Y\|X)$ of any set of variates $Y$, given any other set of variates $X$, including the particular case of $Y$ predictand, and $X$ predictors. The uncertainty or variability around such an average is also automatically computed.
\item \emph{Functional regression:} if the predictand $Y$ or any other variate of interest turns out to be a function $f$ of variates $X$, then their conditional probability will be a delta distribution: $\p(Y\|X) = \delt[Y-f(X)]$.
% We must remember that a function can always be represented by a probability distribution, but not vice versa.\footnote{The function $f\colon x \mapsto y=f(x)$ corresponds to the probability density $\p(y\|x) = \delt[y-f(x)]$, where $\delt$ is a delta distribution.}
Thus the \ljm\ always recovers a functional relationship if there is one, as well as its noise distribution.
\end{itemize}
Furthermore, the machine also quantifies the uncertainty of all outputs above. More precisely, it takes into account how the statistical properties of the learning dataset could be different from those of its original population, owing to sampling fluctuations; and it can compute how much any of the outputs above would probably change if more learning data were collected.
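To make the preceding list concrete, the following minimal Python sketch shows how such quantities are all Monte Carlo averages over posterior samples of the joint frequency distribution. The samples here are faked with a single Dirichlet draw purely for illustration, and the snippet is not the interface of our R scripts; it only mimics the kind of computation performed internally, for a binary predictand $Y$ and one discrete predictor $X$.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical stand-in for the machine's output: S posterior samples of
# the joint population-frequency distribution of a binary predictand Y
# and a discrete predictor X with K values.  Each sample is a (2, K)
# table of frequencies summing to 1.
S, K = 1000, 8
samples = rng.dirichlet(np.ones(2 * K), size=S).reshape(S, 2, K)

x_obs = 3  # observed predictor value of a new patient

# "Discriminative" output: predictive probability of Y = 1 given X = x_obs,
# obtained by averaging the sampled conditional frequencies f(Y=1 | X=x_obs).
cond = samples[:, 1, x_obs] / samples[:, :, x_obs].sum(axis=1)
p_y1_given_x = cond.mean()

# "Generative" output: average likelihood f(X=x_obs | Y=1).
lik = samples[:, 1, x_obs] / samples[:, 1, :].sum(axis=1)
p_x_given_y1 = lik.mean()

# Uncertainty of the conditional frequency itself: a 95% interval
# over the sampled candidate distributions.
lo, hi = np.quantile(cond, [0.025, 0.975])

print(p_y1_given_x, p_x_given_y1, (lo, hi))
\end{verbatim}
Replacing the averaged functional changes the output in the obvious way: an expectation $\E(Y \| X)$, a marginal probability, a quantile, and so on.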
% \mynotep{Is the part below superfluous? I think it could be interesting for readers from machine learning}\\
% \mynotep{I, Alexander, think it should be included as the point is very nicely put (and important). }\\
% \mynotep{Luca: moved this to \sect\,\ref{sec:the_machine}}
In the next section we explain intuitively how the \ljm\ computes the general probability distribution over long-run frequencies. A couple of special characteristics brought about by such computation can already be summarized here. First, in contrast to machine-learning algorithms such as neural networks, random forests, Gaussian processes, support-vector machines, or generalized linear models, the \ljm\ does not assume the existence of a function (possibly contaminated by a little noise) from predictors to predictands. This is a very strong assumption, justifiable in the presence of informationally very rich predictors such as images, but otherwise quite unrealistic for many kinds of predictors considered in medicine, especially those that are more readily available and less invasive and, therefore, more desirable. Second, in contrast to algorithms such as neural networks, random forests, support-vector machines, logistic regression, or generalized linear models, the \ljm\ does not do an optimization during the learning phase, searching for the minimum of some objective function. It does a full \emph{hypothesis-space survey}. % Inference and generalization in fact essentially rely on averaging operations in problems such as the present one \citetext{\citealt{definetti1930,definetti1937,dawid2013}; \citealt[\sects~4.2--4.3]{bernardoetal1994_r2000}}
The optimization done by most machine-learning algorithms is an approximate form of this survey, based on the assumption or hope that the most relevant portion of the hypothesis space will be around the extremum \citetext{\citealp[\chap~16]{mackay1992,murphy2012}; \citealp[see also][]{selfetal1987}}. The underlying necessity of a more extensive survey, however, becomes manifest in many of the obligatory procedures that go together with the training of most machine-learning algorithms, cross-validation being a prominent example \citep{mackay1992b}. This leads to a third special characteristic of the \ljm: it does not need validation sets, test sets, or other data splits; nor does it need cross-validation procedures. Intuitively this is the case because the underlying hypothesis-space survey realizes a sort of full-fledged cross-validation and data partition. It can indeed be proven that one of the internal computations of the machine is mathematically equivalent to doing $k$-fold cross-validations for \emph{all possible} data splits and $k$ \citep{portamana2019b,fongetal2020}.
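Roughly speaking, the gist of that equivalence is the product-rule decomposition of the joint probability of the dataset $(x_1,\dotsc,x_n)$,
\begin{equation*}
\p(x_1,\dotsc,x_n \| I) = \prod_{i=1}^{n} \p(x_i \| x_1,\dotsc,x_{i-1},\ I) ,
\end{equation*}
in which each factor is the probability of one datum given the data already accounted for; since the data are exchangeable, the decomposition holds for every ordering, so this single quantity already encodes predictions of held-out data from retained data, for all possible splits.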
Such flexibility and informationally rich output come, of course, at a computational cost. Until some years ago, the cost would have been prohibitive in all but the simplest inferential problems. But today an inference problem involving 13 variates and 700 datapoints, such as the example considered in the present work, takes less than six hours of computation on an office computer workstation. We discuss computational limitations further in \sect\,\ref{sec:rangeLJM}.
\subsection{Intuitive understanding of the learning algorithm}
\label{sec:the_machine_learning}
The calculations by which the \ljm\ learns and operates are univocally determined by Cox's theorem\footnote{\citealp{cox1946,cox1961,polya1954,polya1954b_r1968,tribus1969,fine1973,rosenkrantz1977,paris1994_r2006,snow1998,halpern1999b,arnborgetal2001,snow2001,claytonetal2017}; see also \citealp{hailperin1996}; and \citealp{vanhorn2003} for a review.}, which yields quantitative inference rules from self-consistency requirements, and by de~Finetti's theorem\footnote{\citealp[\sects~4.2--4.3]{definetti1930,definetti1937,bernardoetal1994_r2000}; for a review see \citealp{dawid2013}.}, which further constrains these rules in the case of \enquote{generalization from similar cases}. These calculations have a very intuitive interpretation.
We consider a patient to be a member of some population of similar past, present, and future patients. Suppose we knew the joint frequency distribution of all possible combinations of predictor and predictand values in such a population. We would then judge the probability for a patient's variate values to be equal to their corresponding population frequency. Pure symmetry considerations lead to this result \citetext{\citealp[Appendix on eduction]{johnson1924}; \citealp[\sects~4.2--4.3]{johnson1932c,definetti1930,dawid2013,bernardoetal1994_r2000}}. The same would be true for conditional and marginal probabilities and frequencies.\footnote{If there were a functional relationship from predictors to predictand, then the predictand value corresponding to the function output would have conditional frequency and probability equal to 1, while all other values would have frequency and probability 0. Therefore, this point of view still encompasses a functional relationship as a particular case.} This population frequency distribution would bound the maximal prognostic power attainable with the given predictors in the population. A higher prognostic power could only be attained by using additional or different predictors having sharper conditional frequencies for the predictand in the population. Given knowledge of such a frequency distribution, there would be no problem of \enquote{generalizing} to new patients, because each new patient would already be counted in the known frequencies. An inference algorithm would only need to enumerate and memorize, rather than to learn and generalize.
Learning and generalization come into play because the frequency distribution for the population is unknown: we only have a sample from it, the \enquote{learning dataset}. Thus we can, at most, assign a probability to each possible frequency distribution. This is precisely what the \ljm\ does.
The way in which the machine assigns a probability to each \enquote{candidate} true frequency distribution is also intuitive. It combines two factors: (i) how well the candidate fits the sample dataset, (ii) how biologically or physically reasonable the candidate is. The first factor is easily computed: it is the joint probability of the dataset if it were sampled from a population having that candidate frequency. The second factor is a prior probability expressing how reasonable that candidate is.\footnote{Some notion of \enquote{reasonable candidate} is unavoidable and clearly present in the construction or testing of any inference algorithm. How can we otherwise judge that an algorithm is over- or under-fitting, given that we do not know the ground truth? (if we knew the latter we would not be making inferences.) Such judgement implies that we have a preconceived reasonable reference distribution in the back of our minds. The fit is either qualitatively compared with this reasonable reference; or it is compared with a known ground-truth, which was in turn chosen because of its similarity with the reasonable reference.} The most general natural requirement for \enquote{reasonableness} is that the candidate should have some degree of smoothness, owing to physical and biological reasons. This prior probability prevents overfitting and underfitting; in fact, it actually \emph{defines} mathematically what can be considered \enquote{overfitting} and \enquote{underfitting}. Figure~\ref{fig:prior_distribution} shows samples of what the machine has been programmed to consider \enquote{reasonable candidates} for the population frequency distribution of a discrete variate. This choice can be altered by the clinician. Note that no frequency distributions are excluded; they are only given higher or lower probabilities.
\begin{subfigure}[t]\setcounter{subfigure}{0}
\centering%
\begin{minipage}[c]{0.39\linewidth}\centering
\includegraphics[width=\linewidth]{priorexamples_AVDEL30MIN_neuro.pdf}
\caption{Samples of initially probable candidates of the true population frequency distribution of an integer variate (for example \ravltdel\ or \ravltrec, to be introduced in \sect\,\ref{sec:dataset}).}\label{fig:prior_distribution}
\end{minipage}\hfill
\begin{minipage}[c]{0.59\linewidth}\centering%
\makebox[0.49\linewidth]{\footnotesize dataset}%
\hfill%
\makebox[0.49\linewidth]{\footnotesize high-probability candidate}%
\\[-1em]
\includegraphics[width=0.49\linewidth]{exampledistr_sample_all.pdf}%
\hfill%
\includegraphics[width=0.49\linewidth]{exampledistr_okish_all.pdf}%
\\
\makebox[0.49\linewidth]{\footnotesize low-probability cand. (poor fit)}%
\hfill%
\makebox[0.49\linewidth]{\footnotesize low-probability cand. (unreasonable)}%
\\[-1em]
\includegraphics[width=0.49\linewidth]{exampledistr_unlikely_all.pdf}
\hfill
\includegraphics[width=0.49\linewidth]{exampledistr_strange_all.pdf}
\caption{Illustration of the two factors determining the final probability of a candidate population-frequency distribution represented as a \textcolor{bluepurple}{blue scatterplot}. \emph{Upper-left}: an example dataset (\textcolor{yellow}{yellow points}) with two variates. \emph{Upper-right}: candidate frequency distribution with high final probability; it fits the data and is reasonable. \emph{Lower-left}: candidate distribution with low final probability; it is reasonable but does not fit the data. \emph{Lower-right}: candidate distribution with low final probability; it fits the data but is not reasonable.}\label{fig:inferring_distribution}
\end{minipage}
\end{subfigure}%
The product of the two factors (i), (ii), normalized, yields the probability of each possible frequency distribution. An illustration of factors (i), (ii) at work is given in \fig~\ref{fig:inferring_distribution} for an example problem with two variates. The \ljm\ outputs the distribution of these final probabilities in the form of a large sample (its size is decided by the user) drawn from it. In this form, all other marginal or conditional probabilities and averages of interest are calculated via Monte Carlo integration. This methodology has been successfully demonstrated for \ad\ with a smaller number of predictors \citep{antonianovillalobosetal2014}, and it is the same as that used, in nonparametric form, in various inferences about the black holes M87 and Sagittarius~A\textsuperscript{*} \citep{eht2019,eht2022}. Further mathematical and computational details are given in appendix~\ref{sec:appendices}.
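As a toy illustration of the two factors at work, here is a minimal Python sketch for a single discrete variate. The counts, the candidate space, and the crude smoothness penalty are all invented for this example; the machine itself uses a much richer nonparametric prior and Monte Carlo sampling rather than this brute-force importance weighting.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)

# Toy dataset: counts of an integer-valued variate with K possible values.
K = 10
counts = np.array([1, 3, 7, 12, 9, 5, 2, 1, 0, 0])

# Candidate population-frequency distributions, sampled uniformly from
# the simplex.  (The machine explores a far richer candidate space.)
M = 20000
candidates = rng.dirichlet(np.ones(K), size=M)
logc = np.log(np.clip(candidates, 1e-300, None))

# Factor (i): how well each candidate fits the data
# (multinomial log-likelihood of the observed counts).
log_fit = (counts * logc).sum(axis=1)

# Factor (ii): how "reasonable" each candidate is; here a crude smoothness
# prior penalizing large second differences of the frequencies.
roughness = (np.diff(candidates, n=2, axis=1) ** 2).sum(axis=1)
log_prior = -50.0 * roughness

# Normalized product of the two factors = final weight of each candidate.
logw = log_fit + log_prior
w = np.exp(logw - logw.max())
w /= w.sum()

# Any quantity of interest is then a weighted (Monte Carlo) average,
# e.g. the expected population frequency of each value:
posterior_mean_freq = (w[:, None] * candidates).sum(axis=0)
print(posterior_mean_freq.round(3))
\end{verbatim}
Changing the coefficient of the smoothness penalty corresponds to choosing a different prior, in the spirit of \fig~\ref{fig:prior_distribution}: no candidate is ever excluded, only given higher or lower weight.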
We close this section emphasizing that the inferential steps of the machine, from input to output, consist of, \emph{literally}, no more than the reiterated application of just three inductive-logic rules:
\begin{equation*}
\begin{gathered}
\p(\textrm{not-}A \| I) = 1-\p(A\|I)\\
\p(A\mathbin{\textrm{or}}B \|I) =
\p(A \|I) + \p(B \|I) -
\p(A\mathbin{\textrm{and}}B \|I) \\
\p(A\mathbin{\textrm{and}}B \|I) =
\p(A \| B\mathbin{\textrm{and}} I) \,
\p(B \|I) \ .
\end{gathered}
\end{equation*}
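For instance, Bayes's theorem, which underlies the calculations of the next section, is nothing but the third rule applied twice to $\p(A\mathbin{\textrm{and}}B \|I)$ and rearranged, assuming $\p(B \|I) \neq 0$:
\begin{equation*}
\p(A \| B\mathbin{\textrm{and}} I) =
\frac{\p(B \| A\mathbin{\textrm{and}} I) \, \p(A \|I)}{\p(B \|I)} \ .
\end{equation*}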
\bigskip% Newpage break just to help while writing the draft
\section{Example application}
\label{sec:application}
\begin{table}[b]\centering
% \begin{minipage}{0.75\linewidth}
\begin{framed}
\small
\caption{\small\bf Main inferential and decision-making steps}\label{tab:main_steps}
\begin{enumerate}\itemsep1ex
\setcounter{enumi}{-1}
\item\label{item:learn} Find or build an appropriate dataset of clinical cases comprising values of the predictors and predictand of interest. Datapoints with partially missing values are allowed.
Input the dataset into the \ljm\ and let it infer the joint full-population frequencies of predictors and predictand underlying the dataset.
\item\label{item:predictors} Measure the present patient's predictor values and input them in the \ljm. Partially missing values are allowed.
\item\label{item:population} Assess which conditional statistics of the dataset can be applied to the present patient, and any auxiliary clinical information available. Quantify the latter in a prior probability.
Input the relevant statistics and auxiliary information for the present patient into the \ljm.
Upon request, the machine can now output the final probability of the predictand's true value for the patient, as well as any other probabilities and likelihoods of interest.
\item\label{item:utilities} Assess the clinical courses of action (treatments, more tests, and so on) available for the present patient, and the utility (benefit and loss) of each course of action, depending on each possible predictand value for the patient.
Input the patient's utilities into the \ljm. The machine outputs the course of action having maximal expected utility.
Upon request, the machine can output the probability of gaining different utilities, perform sensitivity analyses for missing data, and do other similar tasks.
\end{enumerate}
\end{framed}
% \end{minipage}
\end{table}
In this section we illustrate how the \ljm\ is applied in the example case outlined in \sect\,\ref{sec:four_patients}. Although the patients are fictitious, the dataset is real and briefly discussed in the next subsection. The main inferential and decision-making steps are summarized in table~\ref{tab:main_steps}.
% As mentioned in \sect\,\ref{sec:intro_purposes},
Steps 1.--3. are modular: the clinician is free to stop after any of them and use their output in other ways or with other algorithms.
These steps are illustrated in the next three subsections, preceded by an explanation of their rationale. They are presented in chronological order as the clinician would apply them. % In each section, a general overview and discussion of the theory and method behind the specific step is first given, followed by the concrete application to our example case.
Steps 1.--3. could also be presented in reverse order, which would better suit their logical dependence, since the procedure in each step is actually motivated by that of the next. We suggest that readers familiar with the principles of clinical decision-making read the following subsections in the order \ref{sec:dataset}--\ref{sec:predictor_step}--\ref{sec:population_step}--\ref{sec:utilities_step}, whereas readers unfamiliar with these principles read them in the order \ref{sec:dataset}--\ref{sec:utilities_step}--\ref{sec:population_step}--\ref{sec:predictor_step}.
\setcounter{subsection}{-1}
\subsection{Predictors, predictand, and learning dataset}
\label{sec:dataset}
The dataset used in our example comes from the study by the Alzheimer's Disease Neuroimaging Initiative (\adni).\footnote{\url{http://adni.loni.usc.edu}. A complete listing of ADNI investigators can be found at \url{http://adni.loni.usc.edu/wp-content/uploads/how\_to\_apply/ADNI\_Acknowledgement\_List.pdf}.} This longitudinal multicentre study is designed to develop and validate neuroimaging and biochemical biomarkers for the early detection, monitoring, and treatment of \ad\ \citep{petersenetal2010}. The present dataset consists of 704 \adni\ subjects constrained, according to \adni\ criteria, to be between 55 and 90 years old. These subjects were chosen to meet the criteria for \mci\ at their first, baseline assessment, and to have at least two additional study visits and three \textsc{mri} examinations. Each subject's diagnostic status was reevaluated at each study visit. The resulting longitudinal diagnostic label is used as the predictand variate $\cad$ in our study; it categorizes each subject as either converting to \ad\ after the first study visit: $\cad\mo\yes$, or remaining stable with \mci: $\cad\mo\no$. The dataset has 325 subjects (46.2\%) with $\cad\mo\yes$ and 379 (53.8\%) with $\cad\mo\no$. Criteria used for classifying subjects as having \mci\ or \ad, as well as \adni's general criteria for subject inclusion, are described in \citet{mckhannetal1984,petersenetal2010}.
% The 12 predictor variates consist of the results from seven cognitive-test measures: a reading test (\anart), a word fluency test (\cft), tests of executive function (\tmta, \tmtb), the immediate-memory, delayed-recall and recognition-subsets of memory function (\ravltimm, \ravltdel, \ravltrec); an evaluation of depression (\gds); the presence of the \texttt{APOE}-e4 risk allele \citep{liuetal2013}; a normalized measure of the sum of left and right hippocampal volume (\hv); \age; \sex. Further details about these variates and their selection can be found in \citet{ryeetal2022}. The cognitive and \gds\ variates are integer-valued, hippocampal volume and \age\ are continuous, and \apoe\ and \sex\ are binary. The values of one or two of these predictors were missing for 30 subjects in the dataset.
The 12 predictor variates consist of the results from seven cognitive-test measures: a reading test (\anart), a word category fluency test (\cft), trail-making tests of executive function (\tmta, \tmtb), the immediate-memory, delayed-recall and recognition-subsets of memory function (\ravltimm, \ravltdel, \ravltrec); a geriatric depression scale (\gds); the presence of the \texttt{APOE}-e4 risk allele \citep{liuetal2013}; a normalized measure of the sum of left and right hippocampal volume (\hv); \age; \sex. Further details about these variates and their selection can be found in \citet{ryeetal2022}. The cognitive and \gds\ variates are integer-valued, hippocampal volume and \age\ are continuous, and \apoe\ and \sex\ are binary. The values of one or two of these predictors were missing for 30 subjects in the dataset.
The \ljm\ took less than five hours (on a 16-core Intel Core i9-12900K CPU) to calculate the probability distribution for the possible joint population-frequency distributions of the 13 variates.
Some results can already be visualized after this inference. Figure~\ref{fig:marginal_pop_distributions} shows, on the left, the inferred distributions of \ravltdel, \ravltimm, \gds, and hippocampal volume for the subpopulation of patients that will convert to \ad\ (\textcolor{red}{red}) and the subpopulation that will remain with stable \mci\ (\textcolor{blue}{blue}). On the right, the inferred frequency of conversion in the full population is plotted (\textcolor{grey}{grey}), conditional on the same predictors. The thin curves are 100 samples of highly probable population-frequency distributions; the thicker lines are their means, which are also the predictive conditional probabilities.
\begin{figure}[!t]
\centering%
\includegraphics[width=0.43\linewidth]{population_distr_scat_RAVLT-del.pdf}%
\qquad%
\includegraphics[width=0.43\linewidth]{prob_conversion_RAVLT-del.pdf}%
\\
\includegraphics[width=0.43\linewidth]{population_distr_scat_RAVLT-imm.pdf}%
\qquad%
\includegraphics[width=0.43\linewidth]{prob_conversion_RAVLT-imm.pdf}%
\\
\includegraphics[width=0.43\linewidth]{population_distr_scat_GDS.pdf}%
\qquad%
\includegraphics[width=0.43\linewidth]{prob_conversion_GDS.pdf}%
\\
\includegraphics[width=0.43\linewidth]{population_distr_scat_HV.pdf}%
\qquad%
\includegraphics[width=0.43\linewidth]{prob_conversion_HV.pdf}%
\caption{\emph{Left column}: inferred distributions of some predictor variates, for the subpopulation of patients that will convert to \ad\ (\textcolor{redpurple}{red dashed}) and the subpopulation with stable \mci\ (\textcolor{bluepurple}{solid blue}). \emph{Right column}: inferred frequency of conversion to \ad\ in the full population (\textcolor{grey}{grey}), conditional on the same predictors. Thin curves: 100 samples of highly probable population-frequency distributions; thick curves: their means.}\label{fig:marginal_pop_distributions}
\end{figure}%
%
The two subpopulations of patients are clearly distinct in the \ravltdel, \ravltimm, \hv\ variates. These predictors can yield probabilities of conversion as high as 70\% or as low as 10\%. The two subpopulations are practically indistinguishable in the \gds\ variate, which, therefore, always gives very uncertain predictions.
The learning dataset comprises enough data to greatly reduce our uncertainty about the population distributions, as evident from the very narrow spread of the curves.
In fact it leads to identical answers, within numerical-computation error, even if we drastically change the prior illustrated in \fig~\ref{fig:prior_distribution}, for example favouring more unimodal distributions or more multimodal distributions.
These simple results show the great usefulness of the \ljm\ for general medical research.
\subsection{Patient's clinical information}
\label{sec:predictor_step}
The 12 predictor values for our four patients are reported in table~\ref{tab:patients_data}, top. Note that Curtis's value for the Hippocampal Volume is missing; this is not a problem for the \ljm. Given these predictor values the \ljm\ can output any probabilities of interest to the clinician. Table~\ref{tab:patients_data}, bottom, reports three probabilities that are important for the step of the next subsection:\footnote{All relative uncertainties of the results caused by numerical computation error are below 0.8\%, Curtis's two likelihoods being an exception at 2\%.}
% (owing to the illustrative character of this example, we do not fully follow the standards for the expression of measurement uncertainty \citep{jcgm1993_r2008})
\begin{itemize}
\item $\p(\cad\mo\yes \| \predictors)$: the probability that the patient will convert to \ad, given the patient's specific predictors and that the patient comes from the same population as the learning dataset.
\item $\p(\predictors \| \cad\mo\yes)$: the probability that a patient who will convert to \ad\ would have these specific predictor values. In other words, the \emph{likelihood}\footnote{$\p(A\|B)$ is the probability of $A$ given $B$, as well as the likelihood of $B$ given $A$ \citep[\sect\,6.1]{good1950}.} of conversion to \ad, given the predictors.
\item $\p(\predictors \| \cad\mo\no)$: the probability that a patient who will remain with stable \mci\ would have these specific predictor values. In other words, the \emph{likelihood} of stable \mci, given the predictors.
\end{itemize}
\begin{table}[t]
\centering
\begin{tabular}[b]{lcccc}
\hline\\[-1.5\jot]
&{\small Olivia} &{\small Ariel} &{\small Bianca} &{\small Curtis}
\\[2\jot]
\age&75.4&75.4&75.4&63.8 \\
\sex&F&F&F&M \\
\hv${}/10^{-3}$&4.26&4.26&4.26&[missing] \\
\apoe&\no&\no&\no&\yes \\
\anart&18&18&18&15 \\
\cft&21&21&21&14 \\
\gds&3&3&3&2 \\
\ravltimm &36&36&36&20 \\
\ravltdel &5&5&5&0 \\
\ravltrec &10&10&10&3 \\
\tmta&21&21&21&36 \\
\tmtb&114&114&114&126
\\[\jot]
\hline\\
{\small $\p(\cad\mo\yes \| \predictors)$}&
0.302&0.302&0.302&0.703
\\
{\small $\p(\predictors \| \cad\mo\yes)/10^{-12}$}&
8.97&8.97&8.97&1.14
\\
{\small $\p(\predictors \| \cad\mo\no)/10^{-12}$}&
18.6&18.6&18.6&0.343
\\[\jot]
\hline
\end{tabular}\hfill
\caption{Predictor values for the four patients (see \sect\,\ref{sec:dataset}), and resulting conditional probabilities.}\label{tab:patients_data}
\end{table}
\begin{subfigure}[t]\setcounter{subfigure}{0}
\centering%
\begin{minipage}[t]{0.49\linewidth}\centering
\includegraphics[width=\linewidth]{curtis_distr_HV.pdf}
\caption{Probability distribution for Curtis's Hippocampal Volume (\textcolor{green}{green}). The full-population distribution (\textcolor{grey}{dashed grey}) is also plotted for reference.}\label{fig:curtis_HV}
\end{minipage}\hfill%
\begin{minipage}[t]{0.49\linewidth}\centering
\includegraphics[width=\linewidth]{directprob_olivia.pdf}
% \\ \footnotesize Olivia, Ariel, Bianca
\caption{Probability distribution for the frequency of conversion to \AD\ in the subpopulation having Olivia's predictors. The \textcolor{red}{red vertical line} is the value of the probability $\p(\cad\mo\yes \| \predictors)$.}\label{fig:freq_distribution_patients}
\end{minipage}
% \hfill
% \begin{minipage}{0.49\linewidth}\centering
% \includegraphics[width=\linewidth]{directprob_curtis.pdf}\\
% \footnotesize Curtis
% \end{minipage}
\end{subfigure}%
The \ljm\ can also answer other questions of interest to the clinician. For instance, what could be the value of Curtis's Hippocampal Volume? The answer is given in \fig~\ref{fig:curtis_HV}, which also shows the full-population distribution for comparison (dashed grey); with 95\% probability Curtis's value is between 2.8 and 5.3, with a median of 3.8. And what is the frequency of conversion to \ad\ among the subpopulation having Olivia's, Ariel's, or Bianca's predictors? The answer is given in the histogram of \fig~\ref{fig:freq_distribution_patients}: with 95\% probability, the fraction of this subpopulation that eventually converts to \ad\ is between 0.19 and 0.43; this uncertainty range is due to the limited size of the learning dataset. The probability $\p(\cad\mo\yes \| \predictors)$ is equal to the mean of this distribution \citep[\eg][\sects~4.2--4.3]{bernardoetal1994_r2000}, provided the patient and the dataset subjects can be considered as belonging to the same population.
%%%% MI given all minus ...
% AVDEL30MIN_neuro RAVLT_immediate TRABSCOR_neuro AVDELTOT_neuro LRHHC_n_long
% mean 0.125 0.131 0.135 0.137 0.138
% sd 0.004 0.004 0.004 0.004 0.004
% TRAASCOR_neuro CATANIMSC_neuro AGE GDTOTAL_gds ANARTERR_neuro Gender_num_
% mean 0.139 0.139 0.139 0.140 0.140 0.140
% sd 0.004 0.004 0.004 0.004 0.004 0.004
% Apoe4_ all
% mean 0.140 0.140
% sd 0.004 0.004
%%%% MI differences:
% all Apoe4_ Gender_num_ ANARTERR_neuro GDTOTAL_gds AGE CATANIMSC_neuro
% mean 0.000 0.0000121 0.0000939 0.000264 0.000295 0.000512 0.000561
% sd 0.009 0.0090000 0.0090000 0.009000 0.009000 0.009000 0.009000
% TRAASCOR_neuro LRHHC_n_long AVDELTOT_neuro TRABSCOR_neuro RAVLT_immediate
% mean 0.00113 0.00187 0.00234 0.00438 0.00905
% sd 0.00900 0.00900 0.00900 0.00900 0.00800
% AVDEL30MIN_neuro
% mean 0.0144
% sd 0.0080
\bigskip% Newpage break just to help while writing the draft
\subsection{Assessment of relevant subpopulation and auxiliary information}
\label{sec:population_step}
\subsubsection{Rationale}
\label{sec:population_step_rationale}
As already mentioned, and as will be argued more concretely in the next section, the clinician needs a probability in order to choose a treatment or other course of action for the current patient. This probability is computed by generalizing associations between predictors and predictand hidden in a dataset of similar patients, as discussed in \sect\,\ref{sec:the_machine}. The way this generalization is made, however, can differ from patient to patient in two respects:
\begin{itemize}
\item Only some particular directed associations can be generalized to the current patient, whereas others would be inappropriate to generalize. In some cases, for example when the learning dataset is artificially assembled with balancing or stratification methods, some associations cannot be generalized to any patients at all.
\item There can be additional information available for the current patient, for instance some clinical predictors not included in the learning dataset, or other \enquote{softer} information such as family history or geographic background.
\end{itemize}
There is no sharp separation between these two items. The presence of additional information often automatically implies that some associations cannot be generalized from the learning dataset to the current patient.
\vbox{\setlength{\intextsep}{0ex}% with wrapfigure
\setlength{\columnsep}{1ex}% with wrapfigure
\begin{wrapfigure}{r}{0.25\linewidth}% with wrapfigure
%\vspace{-1ex}%
\includegraphics[width=\linewidth]{baseratefallacy3.png}%
% \includegraphics[width=\linewidth]{baseratetrials.png}\\[\jot]
% \includegraphics[width=\linewidth]{baseratepopulation.png}
\end{wrapfigure}%
Let us explain with a familiar example why particular associations cannot be generalized. Most students of medicine
learn about the \emph{base-rate fallacy} \citep{barhillel1980,jennyetal2018,sprengeretal2021,matthews1996}. Consider a large set of clinical trials, illustrated in the upper table on the side, where each dot represents, say, 10\,000 patients. In this sample dataset it is found that, among patients having a particular value \enquote{+} of some predictors (left column), 71.4\% of them (or 5/7, upper square) eventually developed a disease. The fallacy lies in judging that a new real patient from the full population, who has predictor value \enquote{+}, also has a 71.4\% probability of developing that disease. In fact, \emph{this probability will in general be different}. In our example, it is 33.3\% (5/15), as can be seen in the lower table illustrating the full population. This difference would be noticed as soon as the inappropriate probability was used to make prognoses in the full population. A similar situation happens for the predictor value \enquote{$-$}.
There is a discrepancy in the conditional frequencies of predictand given predictors, between the sample dataset and the full population, because the proportion of positive vs negative disease cases in the latter has some value, 16.7\%/83.3\% in our example, whereas the samples for the trials (dashed line in the lower table) were hand-chosen so as to have a 50\%/50\% proportion. This sampling procedure is called \enquote{class balancing} in machine learning \citep{provost2000,drummondetal2005,weissetal2003}. More generally this discrepancy can appear whenever a population and a sample dataset from it do not have the same frequency distribution for the predictand. In this case, we cannot rely on the probabilities of \enquote{predictand given predictors} obtained from the sample dataset, which we symbolically write as}
\begin{equation}
\p(\predictand \| \predictors, \dataset)
\end{equation}
% \mynotew{Maybe make clear at the beginning that we denote predictand with $Y$, predictors with $X$, dataset with $D$, and just write $\p(Y\|X\,D)$?}\\
% \mynotew{I (Alexander) suggest keeping it as it is here but introducing X,Y,D before the equations below.}\\
% \mynotew{[Luca]: agreed! :)}
A little counting in the side figure reveals, however, that other frequencies may be relied upon. Consider the full population. Among all patients who developed the disease, 83.3\% of them (or 5/6, upper row) had the predictor value \enquote{+}; among those who did not develop the disease, 33.3\% (or 1/3, lower row) also had the predictor value \enquote{+}. \emph{And these frequencies are the same in the sample dataset}. These frequencies from the clinical trials can therefore be used to make a prognosis using Bayes's theorem. For brevity, denote the predictors by $X$, the predictand by $Y$, the dataset or trials by $D$, and the full-population base rate by $B$. Bayes's theorem yields
\begin{equation}
\label{eq:base-rate_correction}
\p(Y \| X, D, B) =
\frac{ \p(X \| Y, D) \cdot \p(Y \| B)
}{\sum\limits_{Y} \p(X \| Y, D) \cdot \p(Y \| B)
}
\end{equation}
In our example we find
\begin{equation}
\label{eq:base-rate_correction_example}
\begin{split}
\p(Y\mo\mathord{+} \| X\mo\mathord{+}, D, B)
&=
\frac{
\p(X\mo\mathord{+} \| Y\mo\mathord{+}, D)
\cdot \p(Y\mo\mathord{+} \| B)
}{
\p(X\mo\mathord{+} \| Y\mo\mathord{+}, D)
\cdot \p(Y\mo\mathord{+} \| B)
+
\p(X\mo\mathord{+} \| Y\mo\mathord{-}, D)
\cdot \p(Y\mo\mathord{-} \| B)
} \\[2\jot]
&\approx
\frac{ 0.833 \cdot 0.167}{0.833 \cdot 0.167 + 0.333 \cdot 0.833}
= 0.33
\end{split}
\end{equation}
which is indeed the correct full-population frequency.
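For readers who prefer to check such calculations programmatically, the following minimal Python sketch (purely illustrative; the function and variable names are ours and are not part of any software accompanying this work) applies the correction of \eqn~\eqref{eq:base-rate_correction} to the toy frequencies of the example above:
\begin{verbatim}
# Base-rate correction: Bayes's theorem combining dataset likelihoods
# P(X | Y, D) with an externally given base rate P(Y | B).

def corrected_probability(lik_plus, lik_minus, base_rate_plus):
    """P(Y=+ | X, D, B) from the two likelihoods and the base rate."""
    joint_plus  = lik_plus  * base_rate_plus         # P(X|Y=+,D) P(Y=+|B)
    joint_minus = lik_minus * (1 - base_rate_plus)   # P(X|Y=-,D) P(Y=-|B)
    return joint_plus / (joint_plus + joint_minus)

# Frequencies of the toy example: P(X=+|Y=+,D) = 5/6, P(X=+|Y=-,D) = 1/3,
# full-population base rate P(Y=+|B) = 1/6.
print(round(corrected_probability(5/6, 1/3, 1/6), 3))   # 0.333
\end{verbatim}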
If the samples of the clinical trials had been chosen with the same frequencies as the full population (no \enquote{class balancing}), then the probability $\p(\predictand \| \predictors, \dataset)$ from the dataset would be the appropriate one to use. But the probabilities $\p(\predictors \| \predictand, \dataset)$ together with Bayes's theorem as in \eqn~\eqref{eq:base-rate_correction} would also lead to exactly the same probability. We thus see that \emph{using the probabilities}
\[\p(\predictors \| \predictand, \dataset)\]
\emph{from the dataset is preferable to using} $\p(\predictand \| \predictors, \dataset)$. The former yield the same results as the latter when use of the latter is appropriate, and allow us to apply corrections when use of the latter is inappropriate. The superiority of using $\p(\predictors \| \predictand, \dataset)$ probabilities (called \enquote{generative} in machine learning, see \eg\ \citealp[\sect\,8.6]{murphy2012}) is illustrated with a toy example in table~\ref{tab:superiority_predictors_given_predictand}.
\begin{table}[b]
\begin{framed}
\small
\caption{\small\bf Superiority of the \enquote{predictors$\pmb{\|}$predictand} (or \enquote{generative}) approach}
\label{tab:superiority_predictors_given_predictand}
\mbox{}\\
We split our learning dataset into two subsets:
\begin{itemize}
\item One with 361 subjects and a ratio of 29.9\%/70.1\% of subjects with $\cad\mo\yes$ vs $\cad\mo\no$.
\item One with 343 subjects and a ratio of 63.3\%/36.7\% of subjects with $\cad\mo\yes$ vs $\cad\mo\no$. This subset is used as a fictive full population.
\end{itemize}
This partition was made with no systematic sampling of any variates except the predictand \cad.
\smallskip After training on the learning dataset, we make a prognosis for each of the 343 \enquote{new} patients, through four separate approaches: (a) using the probabilities $\p(\predictand \| \predictors, \dataset)$, as typical of machine-learning algorithms; (b) using $\p(\predictors \| \predictand, \dataset)$ together with the base rate, as explained above; (c) tossing a coin; (d) always prognosing \enquote{$\cad\mo\yes$}, which guarantees 63.3\% correct prognoses owing to the base rate of the full population. Finally, the accuracy of each approach (the fraction of prognoses assigning more than 50\% probability to the correct outcome) is calculated. Here are the results, from lowest to highest:
\medskip
{
% \begin{table}[!h]
\centering
\begin{tabular}{cccc}
\hline\\[-\jot]
{\scriptsize predictand$\|$predictors}&{\scriptsize coin toss}&{\scriptsize always predict conversion}&{\scriptsize predictors$\|$predictand \amp\ base\,rate}
\\[1\jot]
37.3\% & 50\% & 63.3\% & 73.2\%\\[\jot]
\hline
\end{tabular}
% \end{table}
}
\medskip
The \enquote{predictand$\|$predictors} approach (\enquote{discriminative} in machine-learning parlance) leads to worse results than a coin toss because of its underlying base-rate fallacy. The \enquote{predictors$\|$predictand} approach (\enquote{generative} in machine-learning parlance) leads to better results than simply always prognosing the most common base-rate outcome; this shows that the dataset can still provide useful statistical information despite its mismatched base rate. Inference algorithms that only yield \enquote{predictand$\|$predictors} outputs, unlike the \ljm, are incapable of extracting this useful information.
\end{framed}
\end{table}
\medskip
The use of dataset probabilities different from $\p(\predictand \| \predictors, \dataset)$ can be necessary even when the dataset has statistics identical with those of the population it is sampled from. Typical cases are the prognosis of a patient who comes from a peculiar subpopulation or even from a different population \citetext{\citealt{lindleyetal1981}; \citealt{quintanaetal2017}; \citealt[\chap~4]{soxetal1988_r2013}; \citealt[\chap~5]{huninketal2001_r2014}}. For instance, the first case happens when the clinician has additional information not included among the predictor variates, such as the result of an additional clinical test, or family history; the second case happens when the patient comes from a different geographical region. There is of course no sharp distinction between these two cases.
What is important is that, in either case, it can still be possible to use statistical information from the sample dataset to make prognoses. It is sufficient that some \emph{conditional} statistics may be applicable to the specific patient. For a patient coming from a different region, for example, it may be judged that the conditional probabilities $\p(\predictand \| \predictors, \dataset)$ still apply. In other words, the patient may still be considered a member of the subpopulation having those specific predictor values. Using more technical language we say that a new patient can be considered \emph{exchangeable} with the patients constituting the dataset, but only conditional on particular variates. See Lindley \citetext{\citeyear[especially around \sects~7.3, 8.6]{lindley2006_r2014}; \citeyear{lindleyetal1981}} for a clear and logically impeccable presentation not obscured by technical language \citetext{more technical references are \citealt[\sects~4.2--4.3, 4.6]{definetti1930,definetti1937,dawid2013,bernardoetal1994_r2000}; see also \citealt{malinasetal2004_r2016}, \citealt{sprengeretal2021} about confounding and Simpson's paradox, to which this topic is tightly related}.
This topic is complex and of extreme importance for inference, but its detailed study is not the goal of the present work. Our main point here is that population variability and auxiliary clinical information are important factors that differentiate patients, and a personalized approach ought to take them into account. The method presented here does this naturally: it allows great flexibility in selecting which statistical features of the sample dataset should be used for each new patient, and it allows auxiliary clinical information to be integrated in the form of a prior probability. As discussed in \sect\,\ref{sec:predictor_step}, the \ljm\ allows us to quickly calculate the conditional probabilities $\p(Y\|X, \dataset)$ for any variate subsets $Y$ and $X$ required by the patient's relevant population.
\subsubsection{Application to the example study}
\label{sec:posterior_application}
In our example of \sect\,\ref{sec:four_patients}, all statistics of the dataset are considered relevant for Olivia, Bianca, and Curtis. For these patients the clinician can therefore use Bayes's theorem with the likelihoods of table~\ref{tab:patients_data} and the dataset conversion rate of $0.463$ -- or equivalently directly the probabilities $\p(\cad\mo\yes \| \predictors, \dataset)$ provided in the same table.
For Ariel, however, the clinician judges that a different base rate or prior probability of conversion should be used, equal to 65\%, because of her different geographical origin and family history. In her case the clinician uses Bayes's theorem with the likelihoods of table~\ref{tab:patients_data} and the prior probability of $0.65$.
The final probabilities of conversion to \ad\ for our four patients are reported in table~\ref{tab:posterior_patients}. Note how the final probability for Ariel is higher than that for Olivia and Bianca, even though the predictor values are identical for these three patients; a short numerical check of Ariel's value is sketched below.
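As a concrete check of Ariel's final probability, the following Python sketch (illustrative only; it uses the rounded likelihoods reported in table~\ref{tab:patients_data}, so the last digit may differ slightly from the exact \ljm\ output) combines those likelihoods with her prior of $0.65$ through Bayes's theorem:
\begin{verbatim}
# Ariel's final probability of conversion, from the rounded likelihoods
# of the predictor table and her clinician-assessed prior of 0.65.

lik_yes = 8.97e-12    # P(predictors | CAD = yes, dataset)
lik_no  = 18.6e-12    # P(predictors | CAD = no,  dataset)
prior_yes = 0.65      # base rate judged relevant for Ariel

posterior_yes = (lik_yes * prior_yes) / (
    lik_yes * prior_yes + lik_no * (1 - prior_yes))
print(round(posterior_yes, 2))   # 0.47
\end{verbatim}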
\medskip
\begin{table}[!h]
\centering
\begin{tabular}{lcccc}
\hline\\[-1.5\jot]
&{\small Olivia} &{\small Ariel} &{\small Bianca} &{\small Curtis}
\\[\jot]
{\small initial probability\; $\p(\cad\mo\yes \| \auxinfo)$}&
0.463&0.65&0.463&0.463
\\[\jot]
{\small final probability\; $\p(\cad\mo\yes \| \predictors, \dataset, \auxinfo)$}&
0.302&0.47&0.302&0.703
\\[\jot]
\hline
\end{tabular}
\caption{Final probabilities of conversion computed from dataset and auxiliary information}\label{tab:posterior_patients}
\end{table}
\bigskip% Newpage break just to help while writing the draft
\subsection{Assessments of treatments and benefits; final decision}
\label{sec:utilities_step}
\subsubsection{Rationale}
\label{sec:expected_utility_theory}
A crucial point in clinical decision-making is this: the clinician needs to assess not the presence (current or future) of a disease, but the \emph{risk} of its presence. Is there a difference between these two problems, and why is the difference important?
In clinical practice, we can rarely diagnose or prognose a medical condition with full certainty, so perfect classification is impossible. But even a \enquote{most probable} classification, which may be enough in other contexts, is inadequate in clinical ones. The problem is that the clinician has to decide among different courses of action, such as different treatments, more tests, and so on, and the optimal one depends on \emph{how probable} the medical condition is, not just on whether it is more probable than not.
Two examples illustrate this point. Suppose there is a dangerous treatment that extends the patient's lifetime by 1 year if the disease is on its course, but shortens the patient's lifetime by 5 years if the disease is not present. Also suppose that some algorithm tells the clinician whether the disease's presence is \enquote{more probable than not}, given some predictor values, and that the clinician administers the dangerous treatment in that case. It turns out that 60 out of 100 treated patients having these same predictor values eventually develop the disease, so \enquote{more probable than not} is correct. However, the final result is that the clinician has added $1 \times 60 = 60$ years to, but also \emph{subtracted $\mathit{5 \times 40 = 200}$ years} from, the combined lifespans of the treated patients! The conclusion is that the treatment cannot be prescribed just because the disease is \enquote{more probably present than not}. As an opposite example, suppose that a less dangerous treatment extends the patient's lifespan by five years if the disease is on its course, but shortens it by one month if the disease is not present. In this case, it may be advisable to undergo the treatment even if the disease is \emph{less} probably present than not. If the clinician administers the treatment to 100 similar patients, and 20 of them develop the disease, then the clinician has added $5 \times 20=100$ years to and subtracted $\tfrac{1}{12} \times 80 \approx 7$ years from their combined lifespans.
In both examples, it is clearly important to assess the \emph{probability} -- which has a precise connection with the population frequency -- that the patient will develop the disease. In the first example, the treatment should be administered only if the probability is higher than 83.3\%; in the second, it can be administered if the probability exceeds about 1.6\%. The \ljm, as explained in the previous sections, tells the clinician the specific probability for the current patient.
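The break-even probabilities just quoted follow from a simple expected-gain calculation; the short Python sketch below (illustrative only, using the numbers of the two hypothetical treatments above) computes the probability threshold at which each treatment begins to be worthwhile:
\begin{verbatim}
# For a treatment that gains g years if the disease is present and
# loses l years if it is absent, the expected gain p*g - (1-p)*l is
# positive exactly when p > l / (g + l).

treatments = {
    "dangerous":      (1.0, 5.0),         # +1 year vs -5 years
    "less dangerous": (5.0, 1.0 / 12.0),  # +5 years vs -1 month
}

for name, (gain, loss) in treatments.items():
    threshold = loss / (gain + loss)
    print(f"{name}: treat if P(disease) > {threshold:.3f}")

# dangerous:      treat if P(disease) > 0.833
# less dangerous: treat if P(disease) > 0.016
\end{verbatim}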
But the choice between treatments depends not only on the probability of the medical condition. Here is where differences between patients matter the most. Consider again the second example above, about the less dangerous treatment. Let us add that the treatment would extend the lifespan by five years, but would also somewhat worsen the quality of life of the patient and of the patient's family. Suppose our patient is quite old and tired, has had a happy life, and is now looking with a peaceful mind towards death as a natural part of life. Such a patient may prefer to forgo the bother of the treatment and the additional five years, even if the probability for the disease is quite high.
The benefits of the different treatments, and the probability thresholds at which one treatment becomes preferable to another, must therefore be judged and quantified primarily by the patient. Utility theory and maximization of expected utility allow clinician and patient to make such judgements and decisions in a coherent way \citetext{\citealt{soxetal1988_r2013,huninketal2001_r2014}; see also the clear and charming exposition by \citealt{lindley1971_r1988}, and \citealt{ohaganetal2006}}.
We summarize the main, patient-dependent procedure for decision-making, and show how our computations so far fit perfectly with it.
The clinician first assesses and lists the mutually exclusive courses of action available for the specific patient. These could be preventive or curative treatments, more tests, doing nothing, and so on. Often there are \emph{sequences} of decisions available, but the utility framework can be applied to them as well \citep[see references above and][]{raiffa1968_r1970}. In the present work we simply call these heterogeneous alternatives \enquote{treatments} (see footnote~\ref{fn:treatment}, p.~\pageref{fn:treatment}). The list of treatments is already patient-dependent: some alternatives may not be medically suitable (say, owing to allergies or other clinical conditions), some may be economically too costly, and so on.
Each treatment will have different consequences, which additionally depend on the patient's unknown clinical condition of interest. A treatment may have some consequences if the patient has or will develop the disease, and different consequences otherwise. The patient quantifies, with the clinician's guidance, the benefits and costs -- technically called \enquote{utilities} -- of such possible consequences. The quantification of utilities is not within the scope of the present work. The references cited above offer guidelines and rules for numerically translating factors such as quality of life and expected lifespan into utilities.
The treatments, uncertain clinical conditions, and the quantified utilities $U$ of their consequences can be organized into a table of this form:
\begin{center}
\begin{tabular}{cccc}
&{\small clinical condition $a$}&{\small clinical condition $b$}&{\small \ldots}
\\[2\jot]
{\small treatment $\alpha$} & $U_{\alpha a}$ & $U_{\alpha b}$ &$\dotso$ \\[\jot]
{\small treatment $\beta$} & $U_{\beta a}$ & $U_{\beta b}$ &$\dotso$ \\[\jot]
{\small \ldots} &$\dotso$&$\dotso$&$\dotso$
\end{tabular}
\end{center}
which can be compactly represented by a so-called \emph{utility matrix} $\bigl(U_{ij}\bigr)$, the row index $i$ enumerating the treatments, and the column index $j$ the clinical conditions. Note that the numbers of possible treatments and of clinical conditions need not be equal; generally, they are not.
The \emph{expected utility} $\eU_{i}$ of a treatment $i$ is calculated as the expectation of its utilities $U_{ia}, U_{ib}, \dotsc$ with respect to the probabilities $\p(a), \p(b), \dotsc$ of the clinical conditions $a,b,\dotsc$:
\begin{equation}
\label{eq:def_expected_utility}
\eU_{i} \defd U_{ia}\, \p(a) + U_{ib}\, \p(b) + \dotsb
\end{equation}
Note that this corresponds to a matrix multiplication between the matrix of utilities and the vector of probabilities.
Finally, the recommended treatment is the one having \emph{maximal expected utility}.
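In code, the whole decision rule amounts to a matrix--vector product followed by an argmax. The following Python sketch shows the mechanics on placeholder numbers (two treatments, two clinical conditions; all names and values are purely illustrative):
\begin{verbatim}
import numpy as np

def expected_utilities(U, p):
    """Expected utility of each treatment: the product of the utility
    matrix U (treatments x conditions) with the probability vector p."""
    return U @ p

# Illustrative numbers only: two treatments, two clinical conditions.
U = np.array([[10.0, 0.0],    # treatment 1
              [ 6.0, 4.0]])   # treatment 2
p = np.array([0.7, 0.3])      # P(condition a), P(condition b)

eu = expected_utilities(U, p)
print(eu)                  # [7.  5.4]
print(int(np.argmax(eu)))  # 0 -> treatment 1 has maximal expected utility
\end{verbatim}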
% \mynotep{Add a couple of comments about the inevitability of the rules of decision theory \citep{lindley1971_r1988}}
\subsubsection{Application to the example study}
\label{sec:expected_utility_application}
At present there are no cures for \ad, although some recent pharmacological agents have been shown to slow the progression of \ad-related pathology%
\footnote{\eg\ lecanemab, a monoclonal antibody infusion given every two weeks, targeting amyloid beta plaques; see \url{https://www.fda.gov/news-events/press-announcements/fda-grants-accelerated-approval-alzheimers-disease-treatment}.}. But for the sake of our case study let us imagine that in the near future there are three mutually exclusive treatment options for prevention or retardation of the disease; call them $\beta$, $\gamma$, $\delta$, the simple option of \enquote{no treatment} being denoted by $\alpha$. The clinical conditions to be considered are just two: the patient will convert to \ad\ ($\cad\mo\yes$), or will remain with stable \mci\ ($\cad\mo\no$).
We have therefore $4 \times 2 = 8$ possible consequences of the four treatments depending on the two clinical conditions. Our four patients and the clinician quantify the utilities, arriving at the utility matrices shown in table~\ref{tab:utilities_patients}, top. Olivia, Ariel, and Curtis quantify the benefits of the treatments in exactly the same way, but Bianca's quantification differs slightly, because of the interaction of the treatments with several allergies and additional clinical conditions, as explained in \sect\,\ref{sec:four_patients}.
\medskip
\begin{table}[!h]
\centering
\textit{Utility matrices}\\
\begin{tabular}{lccccccc}
\hline\\[-1.5\jot]
&{\small Olivia} &&{\small Ariel} &&{\small Bianca} &&{\small Curtis}
\\[\jot]
$\begin{matrix}&\\[1.5\jot]
\text{treatment }\alpha\\
\text{treatment }\beta\\
\text{treatment }\gamma\\
\text{treatment }\delta
\end{matrix}$
&
$
\begin{gathered}
{\scriptstyle\cad}\\[-2\jot]
\begin{smallmatrix}
\no\enspace &\enspace\yes
\end{smallmatrix}\\[-\jot]
\begin{bmatrix}10&0\\9&3\\8&5\\0&10\end{bmatrix}
\end{gathered}
$
&&
$\begin{gathered}
{\scriptstyle\cad}\\[-2\jot]
\begin{smallmatrix}
\no\enspace &\enspace\yes
\end{smallmatrix}\\[-\jot]
\begin{bmatrix}10&0\\9&3\\8&5\\0&10\end{bmatrix}\end{gathered}$
&&
$\begin{gathered}
{\scriptstyle\cad}\\[-2\jot]
\begin{smallmatrix}
\no\enspace &\enspace\yes
\end{smallmatrix}\\[-\jot]
\begin{bmatrix}10&0\\8&3\\7&5\\0&10\end{bmatrix}\end{gathered}$
&&
$\begin{gathered}
{\scriptstyle\cad}\\[-2\jot]
\begin{smallmatrix}
\no\enspace &\enspace\yes
\end{smallmatrix}\\[-\jot]
\begin{bmatrix}10&0\\9&3\\8&5\\0&10\end{bmatrix}\end{gathered}$
\\[6\jot]
\hline
\end{tabular}
\\[1em]
\textit{Expected utilities and optimal treatments}\\
\begin{tabular}{lcccc}
\hline\\[-1.5\jot]
&{\small Olivia} &{\small Ariel} &{\small Bianca} &{\small Curtis}
\\[\jot]
$\begin{matrix}
\text{treatment }\alpha\\
\text{treatment }\beta\\
\text{treatment }\gamma\\
\text{treatment }\delta\\[\jot]
\textbf{optimal}
\end{matrix}$
&
$\begin{matrix}6.98\\\bm{7.19}\\7.09\\3.02\\[\jot]\bm{\beta}\end{matrix}$
&
$\begin{matrix}5.27\\6.16\\\bm{6.58}\\4.73\\[\jot]\bm{\gamma}\end{matrix}$
&
$\begin{matrix}\bm{6.98}\\6.49\\6.40\\3.02\\[\jot]\bm{\alpha}\end{matrix}$
&
$\begin{matrix}2.97\\4.78\\5.89\\\bm{7.03}\\[\jot]\bm{\delta}\end{matrix}$
\\[5\jot]
\hline
\end{tabular}
\caption{Utility matrices, expected utilities, and optimal treatments for our four patients}\label{tab:utilities_patients}
\end{table}
The probabilities for the two medical conditions are those found in the previous subsection, table~\ref{tab:posterior_patients}. For brevity, we denote just by $\p(\cad)$ the probability of conversion given a patient's predictor values, and by $\p(\smci)\equiv 1- \p(\cad)$ the complementary probability of stable \mci, given the same predictor values. The expected utilities of each treatment for each patient can then be easily computed. For example, for Olivia the expected utility of treatment $\beta$ is
\begin{equation}
\label{eq:utility_olivia_example}
\begin{split}
\eU_{\beta} &=
9 \cdot \p(\cad\mo\no \| \predictors,\dataset,\auxinfo) +
3 \cdot \p(\cad\mo\yes \| \predictors,\dataset,\auxinfo)
\\&= 9 \cdot (1-0.302) + 3 \cdot 0.302 \approx 7.19
\end{split}
\end{equation}
The results for all patients are reported in table~\ref{tab:utilities_patients}, bottom, with the maximal expected utilities in \textbf{boldface}.
A summary of the clinician's inputs, the \ljm's outputs, and the final decisions is given in table~\ref{tab:summary} on page~\pageref{tab:summary}.
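For completeness, the bottom part of table~\ref{tab:utilities_patients} can be reproduced with a few lines of Python. The sketch below is not the code used for this work; it uses the rounded probabilities of table~\ref{tab:posterior_patients}, so Ariel's values agree with the table only up to rounding:
\begin{verbatim}
import numpy as np

# Utility matrices: rows = treatments alpha..delta, columns = CAD no/yes.
U_common = np.array([[10, 0], [9, 3], [8, 5], [0, 10]], dtype=float)
U_bianca = np.array([[10, 0], [8, 3], [7, 5], [0, 10]], dtype=float)

p_yes = {"Olivia": 0.302, "Ariel": 0.47, "Bianca": 0.302, "Curtis": 0.703}
U     = {"Olivia": U_common, "Ariel": U_common,
         "Bianca": U_bianca, "Curtis": U_common}
treatments = ["alpha", "beta", "gamma", "delta"]

for patient, p in p_yes.items():
    probs = np.array([1 - p, p])    # P(CAD = no), P(CAD = yes)
    eu = U[patient] @ probs         # expected utility of each treatment
    best = treatments[int(np.argmax(eu))]
    print(patient, np.round(eu, 2), "optimal:", best)

# Olivia [6.98 7.19 7.09 3.02] optimal: beta
# Ariel  [5.3  6.18 6.59 4.7 ] optimal: gamma
# Bianca [6.98 6.49 6.4  3.02] optimal: alpha
# Curtis [2.97 4.78 5.89 7.03] optimal: delta
\end{verbatim}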
% ## olivia ariel curtis
% ## 1 0.698321 0.527060 0.297440
% ## 2 0.718993 0.616236 0.478464
% ## 3 0.709496 0.658118 0.589232
% ## 4 0.301679 0.472940 0.702560
% ## bianca
% ## 1 0.698321
% ## 2 0.649161
% ## 3 0.639664
% ## 4 0.301679
% \subsection{Maximization of expected benefit}
% \label{sec:expected_utility_step}
%% Mutual-info results
% MI:
% mean sd
% GDTOTAL_gds 0.00052096 0.0003
% Gender_num_ 0.00320230 0.0008
% Apoe4_ 0.00349680 0.0010
% AGE 0.00616330 0.0010
% ANARTERR_neuro 0.00686160 0.0010
% TRAASCOR_neuro 0.01329100 0.0020
% TRABSCOR_neuro 0.02385100 0.0020