Some example data science plots in R using `ggplot2`. See https://github.com/WinVector/WVPlots for code/details.

```
# Build the shared example data frame used by the scatterplot examples.
set.seed(34903490)
x <- rnorm(50)
# y is a noisy quadratic function of x.
y <- 0.5 * x^2 + 2 * x + rnorm(length(x))
frm <- data.frame(
  x = x,
  y = y,
  # yC flags rows whose y is at or above the 80th percentile of y.
  yC = y >= as.numeric(quantile(y, probs = 0.8)),
  stringsAsFactors = FALSE)
frm$absY <- abs(frm$y)
frm$posY <- frm$y > 0
```

Scatterplot with smoothing line through points.

`WVPlots::ScatterHist(frm, "x", "y", title="Example Fit")`

Scatterplot with best linear fit through points. Also report the R-squared and significance of the linear fit.

```
# Scatterplot with a linear ("lm") smoother; estimate_sig = TRUE asks for the
# significance of the fit to be reported.
WVPlots::ScatterHist(frm, "x", "y", smoothmethod = "lm",
                     title = "Example Linear Fit", estimate_sig = TRUE)
```

Scatterplot compared to the line `x = y`. Also report the coefficient of determination between `x` and `y` (where `y` is “true outcome” and `x` is “predicted outcome”).

```
# Compare the points against the identity line (smoothmethod = "identity").
WVPlots::ScatterHist(frm, "x", "y", smoothmethod = "identity",
                     title = "Example Relation Plot", estimate_sig = TRUE)
```

Scatterplot of *(x, y)* color-coded by category/group, with marginal distributions of *x* and *y* conditioned on group.

```
set.seed(34903490)
# Two independent standard-normal coordinates.
fmScatterHistC <- data.frame(
  x = rnorm(50),
  y = rnorm(50),
  stringsAsFactors = FALSE)
# Group label: TRUE when x + y is positive.
fmScatterHistC$cat <- fmScatterHistC$x + fmScatterHistC$y > 0
WVPlots::ScatterHistC(fmScatterHistC, "x", "y", "cat",
                      title = "Example Conditional Distribution")
```

Scatterplot of *(x, y)* color-coded by discretized *z*. The continuous variable *z* is binned into three groups, and then plotted as with `ScatterHistC`.

```
set.seed(34903490)
# Two independent standard-normal coordinates.
frmScatterHistN <- data.frame(
  x = rnorm(50),
  y = rnorm(50),
  stringsAsFactors = FALSE)
# Continuous variable used for color-coding: the sum of the two coordinates.
frmScatterHistN$z <- frmScatterHistN$x + frmScatterHistN$y
WVPlots::ScatterHistN(frmScatterHistN, "x", "y", "z",
                      title = "Example Joint Distribution")
```

Plot the relationship *y* as a function of *x* with a smoothing curve that estimates \(E[y | x]\). If *y* is a 0/1 variable as below (binary classification, where 1 is the target class), then the smoothing curve estimates \(P(y | x)\). Since \(y \in \{0,1\}\) with \(y\) intended to be monotone in \(x\) is the most common use of this graph, `BinaryYScatterPlot` uses a `glm` smoother by default (`use_glm=TRUE`, this is essentially Platt scaling), as the best estimate of \(P(y | x)\).

```
# use_glm = FALSE: do not use the glm smoother (ggplot2 smoothing instead).
WVPlots::BinaryYScatterPlot(frm, "x", "posY", use_glm = FALSE,
                            title = "Example 'Probability of Y' Plot (ggplot2 smoothing)")
```

```
# use_glm = TRUE: smooth with a glm fit.
WVPlots::BinaryYScatterPlot(frm, "x", "posY", use_glm = TRUE,
                            title = "Example 'Probability of Y' Plot (GLM smoothing)")
```

```
# The hexbin package is optional, so only run this example when it is installed.
if (requireNamespace("hexbin", quietly = TRUE)) {
  set.seed(5353636)

  # Stack two Gaussian clusters, one centred at (1, 1) and one at (-1, -1).
  df <- rbind(data.frame(x = rnorm(1000, mean = 1),
                         y = rnorm(1000, mean = 1, sd = 0.5),
                         stringsAsFactors = FALSE),
              data.frame(x = rnorm(1000, mean = -1, sd = 0.5),
                         y = rnorm(1000, mean = -1, sd = 0.5),
                         stringsAsFactors = FALSE),
              stringsAsFactors = FALSE)

  print(WVPlots::HexBinPlot(df, "x", "y", "Two gaussians"))
}
```

```
# Example data for the gain-curve plots.
set.seed(34903490)
# y is the true value (at least 0.1); x is a noisy, non-negative model score.
y <- abs(rnorm(20)) + 0.1
x <- abs(y + 0.5 * rnorm(20))

frm <- data.frame(
  model = x,
  value = y,
  stringsAsFactors = FALSE)

# Every item costs 1 except the first, which costs 5.
frm$costs <- 1
frm$costs[1] <- 5
frm$rate <- with(frm, value / costs)

# Flag rows whose value is at or above the 80th percentile of value.
frm$isValuable <- (frm$value >= as.numeric(quantile(frm$value, probs = 0.8)))
```

Basic curve: each item “costs” the same. The wizard sorts by true value, the x axis sorts by the model, and plots the fraction of the total population.

`WVPlots::GainCurvePlot(frm, "model", "value", title="Example Continuous Gain Curve")`

We can annotate a point of the model at a specific x value

```
gainx <- 0.10  # get the top 10% most valuable points as sorted by the model

# Build the label for the annotated point.
#   gx: x coordinate of the point, as a fraction (converted to a percentage).
#   gy: y coordinate of the point, as a fraction (converted to a percentage).
# Returns a single two-line string.
labelfun <- function(gx, gy) {
  pctx <- gx * 100
  pcty <- gy * 100

  paste0("The top ", pctx, "% most valuable points by the model\n",
         "are ", pcty, "% of total actual value")
}
# Annotate the gain curve at x = gainx, with the label text built by labelfun.
WVPlots::GainCurvePlotWithNotation(frm, "model", "value",
                                   title = "Example Gain Curve with annotation",
                                   gainx = gainx, labelfun = labelfun)
```

When the `x` values have different costs, take that into account in the gain curve. The wizard now sorts by value/cost, and the x axis is sorted by the model, but plots the fraction of total cost, rather than total count.

`WVPlots::GainCurvePlotC(frm, "model", "costs", "value", title="Example Continuous Gain CurveC")`

```
set.seed(34903490)
# data with two different regimes of behavior
frm <- rbind(
  # Regime 1: 1000 rows, model ~ N(0, 1), TRUE drawn with probability 0.02.
  data.frame(
    model = rnorm(1000),
    isValuable = sample(c(TRUE, FALSE), prob = c(0.02, 0.98),
                        size = 1000, replace = TRUE)),
  # Regime 2: 200 rows, model shifted up by 5, TRUE/FALSE equally likely.
  data.frame(
    model = rnorm(200) + 5,
    isValuable = sample(c(TRUE, FALSE), size = 200, replace = TRUE))
)
# ROC curve for the "model" score against isValuable == TRUE.
WVPlots::ROCPlot(frm, "model", "isValuable", TRUE, title = "Example ROC plot")
```

Plotting the ROC of two models on the same data, where predictions and true outcome are all in the same data frame.

```
set.seed(34903490)
# Two candidate scores; the outcome y depends on both, plus noise.
x1 <- rnorm(50)
x2 <- rnorm(length(x1))
y <- 0.2 * x2^2 + 0.5 * x2 + x1 + rnorm(length(x1))
frmP <- data.frame(
  x1 = x1,
  x2 = x2,
  # yC flags rows whose y is at or above the 80th percentile of y.
  yC = y >= as.numeric(quantile(y, probs = 0.8)),
  stringsAsFactors = FALSE)
# WVPlots::ROCPlot(frmP, "x1", "yC", TRUE, title="Example ROC plot")
# WVPlots::ROCPlot(frmP, "x2", "yC", TRUE, title="Example ROC plot")
# Compare the ROC curves of scores x1 and x2 against the same outcome yC.
WVPlots::ROCPlotPair(frmP, "x1", "x2", "yC", TRUE,
                     title = "Example ROC pair plot")
```

Plotting the results from two data sets, for example the results of a model on training and test sets, where predictions/outcome for the two data sets are in different data frames.

```
set.seed(2342458)

# Generate a synthetic data set with `nrows` rows.
# Columns:
#   x  - standard-normal draw
#   y  - sin(x) plus Gaussian noise scaled by 0.25
#   x2 - standard-normal draw that does not enter y
#   yc - logical, TRUE where y > 0.5
# Returns a data.frame.
make_data <- function(nrows) {
  d <- data.frame(x = rnorm(nrows))
  d[["y"]] <- sin(d[["x"]]) + 0.25 * rnorm(n = nrows)
  d[["x2"]] <- rnorm(n = nrows)
  d[["yc"]] <- d[["y"]] > 0.5
  d
}
# Separate training (500 rows) and test (200 rows) samples.
training <- make_data(500)
test <- make_data(200)

# Binomial glm fit on the training data only.
model <- glm(yc ~ x + x2, data = training, family = binomial)

# Score both frames with the same fitted model, on the response scale.
training$pred <- predict(model, newdata = training, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
# Overlay the ROC curves of the same model on the training and test frames.
WVPlots::ROCPlotPair2(nm1 = "Training", # model 1
                      frame1 = training,
                      xvar1 = "pred", truthVar1 = "yc", truthTarget1 = TRUE,
                      nm2 = "Test", # model 2
                      frame2 = test,
                      xvar2 = "pred", truthVar2 = "yc", truthTarget2 = TRUE,
                      title = "Model performance, training vs test",
                      estimate_sig = FALSE)
```