A healthcare-centric take on the famous quartet.
Francis Anscombe was an English statistician.
Things he did in his life had names like the Anscombe Transform and the theory of Subjective Probability.
Oh, and his brother-in-law was some dude named John Tukey.
That’s all fine and good, but I want to talk about what has come to be known as Anscombe’s Quartet:
x1 <- c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y1 <- c(8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68)
x2 <- c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y2 <- c(9.14, 8.14, 8.74, 8.77, 9.26, 8.1, 6.13, 3.1, 9.13, 7.26, 4.74)
x3 <- c(10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5)
y3 <- c(7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73)
x4 <- c(8, 8, 8, 8, 8, 8, 8, 19, 8, 8, 8)
y4 <- c(6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.5, 5.56, 7.91, 6.89)
quartet <- as.data.frame(cbind(x1, y1, x2, y2, x3, y3, x4, y4))
quart_tbl <- reactable(
quartet,
pagination = FALSE,
outlined = TRUE,
defaultColDef = colDef(
footerStyle = list(fontWeight = "bold"),
headerClass = "col-header",
footerClass = "col-footer",
align = "left"
),
columnGroups = list(
colGroup(name = "I",
columns = c("x1", "y1")),
colGroup(name = "II",
columns = c("x2", "y2")),
colGroup(name = "III",
columns = c("x3", "y3")),
colGroup(name = "IV",
columns = c("x4", "y4"))
),
columns = list(x1 = colDef(
footer = function(values)
sprintf("%.2f", mean(values)),
name = "x",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
y1 = colDef(
style = list(
borderRight = "1px solid rgba(0, 0, 0, 0.1)"),
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "y",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center",
),
x2 = colDef(
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "x",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
y2 = colDef(
style = list(
borderRight = "1px solid rgba(0, 0, 0, 0.1)"),
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "y",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
x3 = colDef(
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "x",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
y3 = colDef(
style = list(
borderRight = "1px solid rgba(0, 0, 0, 0.1)"),
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "y",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
x4 = colDef(
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "x",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
y4 = colDef(
footer =
function(values)
sprintf("%.2f",
mean(values)),
name = "y",
format = colFormat(
digits = 1
),
defaultSortOrder = "desc",
align = "center"
),
html = TRUE
),
compact = FALSE,
class = "rcm-tbl"
)
div(
class = "rcm-analysis",
div(
class = "rcm-header",
div(class = "rcm-title", "Anscombe's Quartet"),
"Francis Anscombe, American Statistician: Graphs in Statistical Analysis (1973)"
),
quart_tbl
)
In 1973, Anscombe constructed the four data sets you see above. Each set consists of eleven points for an \(x\) coordinate and a \(y\) coordinate. The \(x\) values are the same for the first three datasets.
And they have nearly identical summary statistics:
| Property | Value |
|---|---|
| Mean of \(x\) | 9 |
| Mean of \(y\) | 7.50 |
| Variance of \(x\) | 11 |
| Variance of \(y\) | 4.125 |
| Correlation between \(x\) and \(y\) | 0.816 |
| Linear Regression line | \(y\) = 3.00 + 0.500\(x\) |
| \(R\) Squared | 0.67 |
Why is this important?
Why did he do this?
Because exploring and visualizing a dataset is important.
Compare the four sets in the table to the following graphs:
library(billboarder)
set_theme("datalab")
quartet2 <- quartet %>%
mutate(
row = row_number()
) %>%
tidyr::pivot_longer(
!row,
names_to = "set",
values_to = "values"
)
billboarder(
width = 700,
height = 400
) %>%
bb_barchart(
data = quartet2,
color = "#102246",
mapping = bbaes(x = set, y = values, group = row)
) %>%
bb_legend(show = FALSE) %>%
bb_labs(
title = "Anscombe's Quartet",
y = "Values",
x = "Variables"
)
xf <- c(4, 6, 8, 10, 12, 14, 19)
yf <- c(5, 6, 7, 8, 9, 10, 12.5)
fit <- as.data.frame(cbind(xf, yf))
# Set 1
quart_hc1 <- hchart(quartet, "scatter",
hcaes(x1, y1)) %>%
hc_add_series(fit, "line",
hcaes(xf, yf)
)%>%
hc_yAxis(
title = list(text = "y1"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_xAxis(
title = list(text = "x1"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
valueDecimals = 2,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_xAxis(min = 4)
# Set 2
quart_hc2 <- hchart(quartet, "scatter",
hcaes(x2, y2)) %>%
hc_add_series(fit, "line",
hcaes(xf, yf)
)%>%
hc_yAxis(
title = list(text = "y2"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_xAxis(
title = list(text = "x2"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
valueDecimals = 2,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_xAxis(min = 4)
# Set 3
quart_hc3 <- hchart(quartet, "scatter",
hcaes(x3, y3)) %>%
hc_add_series(fit, "line",
hcaes(xf, yf)
)%>%
hc_yAxis(
title = list(text = "y3"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_xAxis(
title = list(text = "x3"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
valueDecimals = 2,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_xAxis(min = 4)
# Set 4
quart_hc4 <- hchart(quartet, "scatter",
hcaes(x4, y4)) %>%
hc_add_series(fit, "line",
hcaes(xf, yf)
)%>%
hc_yAxis(
title = list(text = "y4"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_xAxis(
title = list(text = "x4"),
labels = list(format = "{value}"),
crosshair = list(
snap = TRUE,
width = 2,
zIndex = 0)
) %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
valueDecimals = 2,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_xAxis(min = 4)
Anscombe specifically constructed this quartet to emphasize the importance of visualizing data before beginning to analyze it. What would you expect from John Tukey’s in-law?
Now, what does this have to do with healthcare, medical billing and coding, the revenue cycle, etc.? Glad you asked: summary statistics are not to be trusted on their own.
Lets say I have the following data from a client:
rcm_1half <- rcmann %>%
filter(date <= "2021-06-30") %>%
select(date,
visit,
uniqpt,
newpt,
em,
rvu
)
rcm_tb1 <- rcm_1half %>%
reactable(
pagination = FALSE,
outlined = TRUE,
defaultColDef = colDef(
footerStyle = list(fontWeight = "bold"),
headerClass = "col-header",
footerClass = "col-footer",
align = "left"
),
columns = list(
date = colDef(
name = "Date",
footer = "Averages"
),
visit = colDef(
name = "Visits",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
uniqpt = colDef(
name = "Patients",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
newpt = colDef(
name = "New Patients",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
em = colDef(
name = "E/M Visits",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
rvu = colDef(
name = "RVUs",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 2
),
defaultSortOrder = "desc"
),
html = TRUE
),
compact = FALSE,
class = "rcm-tbl"
)
div(
class = "rcm-analysis",
div(
class = "rcm-header",
div(class = "rcm-title", "Encounter Related Data: Q1 through Q2 2021")
),
rcm_tb1
)
rcm_2half <- rcmann %>%
filter(date > "2021-06-30") %>%
select(
visit,
uniqpt,
newpt,
em,
rvu
)
rcm_1half2 <- rcm_1half %>%
select(!(date))
rcm_2half_new <- mimic(rcm_2half, rcm_1half2)
rcm_2half_new <- as.data.frame(rcm_2half_new)
rcm_2half_new <- rcm_2half_new %>%
rename(
visit = new1,
uniqpt = new2,
newpt = new3,
em = new4,
rvu = new5
)
date <- c(
"2021-07-31",
"2021-08-31",
"2021-09-30",
"2021-10-31",
"2021-11-30",
"2021-12-31"
)
rcm_2half <- cbind(date, rcm_2half_new)
rcm_tb2 <- rcm_2half %>%
reactable(
pagination = FALSE,
outlined = TRUE,
defaultColDef = colDef(
footerStyle = list(fontWeight = "bold"),
headerClass = "col-header",
footerClass = "col-footer",
align = "left"
),
columns = list(
date = colDef(name = "Date",
footer = "Averages"
),
visit = colDef(
name = "Visits",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
uniqpt = colDef(
name = "Patients",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
newpt = colDef(
name = "New Patients",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
em = colDef(
name = "E/M Visits",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 0
),
defaultSortOrder = "desc"
),
rvu = colDef(
name = "RVUs",
footer = function(values)
sprintf("%.2f", mean(values)),
format = colFormat(
digits = 2
),
defaultSortOrder = "desc"
),
html = TRUE
),
compact = FALSE,
class = "rcm-tbl"
)
div(
class = "rcm-analysis",
div(
class = "rcm-header",
div(class = "rcm-title", "Encounter Related Data: Q3 through Q4 2021")
),
rcm_tb2
)
rcm_1half_visum <- rcm_1half %>%
select(visit) %>%
summarise(visit = mean(visit))
rcm_2half_visum <- rcm_2half %>%
select(visit) %>%
summarise(visit = mean(visit))
rcm_visum <- rbind(rcm_1half_visum, rcm_2half_visum)
yhalf <- c("1H2021", "2H2021")
rcm_visum_2021 <- cbind(yhalf, rcm_visum)
# Six Month Average Number of Visits
hc_visum <- hchart(rcm_visum_2021, "column",
hcaes(yhalf, round(visit, 0), color = yhalf)) %>%
hc_yAxis(
gridLineWidth = 0,
labels = list(style = list(color = "#000000")),
title = list(text = " ", style = list(color = "#000000"))
) %>%
hc_xAxis(
labels = list(style = list(color = "#000000")),
title = list(text= " "),
lineWidth = 0,
tickWidth = 0
) %>%
hc_title(text = "Patient Visits 2021") %>%
hc_subtitle(text = "Six Month Averages") %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_plotOptions(
column = list(
#color = "#4BBD85",
dataLabels = list(
valueDecimals = 2,
valueSuffix = '%',
enabled = TRUE)
)
) %>%
hc_size(height = 400, width = 350)
rcm_1halfvis <- rcm_1half %>%
select(
date,
visit
)
rcm_1halfvis$date <- as.character(rcm_1halfvis$date)
rcm_2halfvis <- rcm_2half %>%
select(
date,
visit
)
# Number of Visits - First Half of 2021
hc_1hvis <- hchart(rcm_1halfvis, "column",
hcaes(date, round(visit, 0), color = date)) %>%
hc_yAxis(
gridLineWidth = 0,
labels = list(style = list(color = "#000000")),
title = list(text = " ", style = list(color = "#000000"))
) %>%
hc_xAxis(
labels = list(style = list(color = "#000000")),
title = list(text= " "),
lineWidth = 0,
tickWidth = 0
) %>%
hc_title(text = "Average Patient Visits: 884") %>%
hc_subtitle(text = "January to June 2021") %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_plotOptions(
column = list(
#color = "#4BBD85",
dataLabels = list(
valueDecimals = 2,
valueSuffix = '%',
enabled = TRUE)
)
)
# Number of Visits - Second Half of 2021
hc_2hvis <- hchart(rcm_2halfvis, "column",
hcaes(date, round(visit, 0), color = date)) %>%
hc_yAxis(
gridLineWidth = 0,
labels = list(style = list(color = "#000000")),
title = list(text = " ", style = list(color = "#000000"))
) %>%
hc_xAxis(
labels = list(style = list(color = "#000000")),
title = list(text= " "),
lineWidth = 0,
tickWidth = 0
) %>%
hc_title(text = "Average Patient Visits: 884") %>%
hc_subtitle(text = "July to December 2021") %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
useHTML = TRUE,
crosshairs = TRUE,
borderWidth = 1,
sort = TRUE) %>%
hc_plotOptions(
column = list(
#color = "#4BBD85",
dataLabels = list(
valueDecimals = 2,
valueSuffix = '%',
enabled = TRUE)
)
)
crosstalk::bscols(
widths = NA,
hc_1hvis,
hc_2hvis
)
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/andrewallenbruce, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Bruce (2022, April 14). Andrew Bruce: Anscombe's Quartet: Healthcare Edition. Retrieved from https://andrewbruce.netlify.app/posts/2022-04-12-anscombes-quartet-healthcare-edition/
BibTeX citation
@misc{bruce2022anscombe's,
author = {Bruce, Andrew},
title = {Andrew Bruce: Anscombe's Quartet: Healthcare Edition},
url = {https://andrewbruce.netlify.app/posts/2022-04-12-anscombes-quartet-healthcare-edition/},
year = {2022}
}