7  Command Compendium

This chapter is designed to help you easily craft commands.

ggplot color options

7.1 Basic Data Manipulation

7.1.1 Creating Objects

Creating an object allows you to store the output of a command in the environment. This means you can reuse the result later by referencing the object name, instead of rerunning the command every time.

## When this command runs, it will show up in the environment
chart <- ggplot(DATASET_OBJECT, aes(y = VARIABLE_OF_INTEREST)) +
  geom_boxplot(fill = "lightgrey")

##now I can "call" the object
chart

7.1.2 Filtering Data

To create our analytical dataset we often need to subset a larger dataset. Filtering allows us to simplify. We always want to save filtered data as objects so that we can reference them in other commands.

#to select certain columns
columns <- DATASET |> select(COLUMN1, COLUMN2, COLUMN3)

### IF VALUES ARE STRING (NON-NUMERIC) THEY MUST BE IN "" 

#to filter by values in a certain column
values <- DATASET |> filter(COLUMN == VALUE)

# to filter by multiple conditions. 
#and
values <- DATASET |> filter(COLUMN == VALUE & COLUMN2 == VALUE2)

#or 
values <- DATASET |> filter(COLUMN == VALUE | COLUMN2 == VALUE2)

#in 
values <- DATASET |> filter(COLUMN %in% c(VALUE1, VALUE2, VALUE3))

#numeric operator
values <- DATASET |> filter(COLUMN < 2000)

7.1.3 Making New Columns

#creating new columns
dataset <- DATASET |> mutate(NEWCOLUMN = COMMAND, NEWCOLUMN2 = COMMAND)

7.1.4 Renaming Columns

#renaming columns
clean_data <- DATASET |> rename(
  new_name1 = old_name1,
  new_name2 = old_name2
)

7.1.5 Grouping and Summarizing Data

summary_data <- DATASET |> 
  group_by(CATEGORY_COLUMN) |> 
  summarize(
    mean_value = mean(NUMERIC_COLUMN, na.rm = TRUE),
    count = n()
  )

7.1.6 Creating a Spatial Object

#assumes a nonspatial object with a longitude and latitude field
grouped_pm_sf <- YOUR_NEWOBJECT |> st_as_sf(coords = c("lon", "lat"), crs = 4326)

7.2 Descriptive Statistics

7.2.1 Descriptive Statistics Tables

#requires e1071 package
#requires gt package
## descriptive statistics for quantitative data
  DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  select(VARIABLE_OF_INTEREST) |> #field name of variable of interest
  summarise(                  #summarizes variable of interest. You can add/remove
    n = n(),
    num_na = sum(is.na(VARIABLE_OF_INTEREST)), 
    mean = mean(VARIABLE_OF_INTEREST, na.rm = TRUE),
    median = median(VARIABLE_OF_INTEREST, na.rm = TRUE),
    sd = sd(VARIABLE_OF_INTEREST, na.rm = TRUE),
    variance = var(VARIABLE_OF_INTEREST, na.rm = TRUE),
    mean_dev = mean(abs(VARIABLE_OF_INTEREST - mean(VARIABLE_OF_INTEREST, na.rm = TRUE)), na.rm = TRUE),
    cv = sd / mean,
    min = min(VARIABLE_OF_INTEREST, na.rm = TRUE),
    max = max(VARIABLE_OF_INTEREST, na.rm = TRUE),
    skewness = skewness(VARIABLE_OF_INTEREST, na.rm = TRUE), 
    kurtosis = kurtosis(VARIABLE_OF_INTEREST, na.rm = TRUE)
                               
  ) |> 
    #this starts the table formatting.
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
  gt() |>
  tab_header(
    title = "TITLE" ##Add your title here
  ) |>
  fmt_number(
    columns = everything(),
    decimals = 2 ##You can change number of decimals
  )

## Descriptive statistics table for categorical data
DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  group_by(ruca_code) |> #variable that you want to summarize
  summarise(n = n())  |> #calculates the count
  gt() #formats into a table

7.2.2 Histogram

#requires ggplot2 package
## Create a basic histogram. 
## You will likely need to modify the binwidth command
ggplot(DATASET_OBJECT, aes(x = VARIABLE_OF_INTEREST)) +
  geom_histogram(binwidth = 1, fill = "lightgrey", color = "black", alpha = 0.7) +
  labs(
    title = "TITLE",
    x = "X-AXIS LABEL",
    y = "Y-AXIS LABEL"
  ) 

7.2.3 Boxplot

#requires ggplot2 package
# create basic boxplot
ggplot(DATASET_OBJECT, aes(y = VARIABLE_OF_INTEREST)) +
  geom_boxplot(fill = "lightgrey") +
  labs(
    title = "TITLE",
    y = "Y-AXIS LABEL"
  )

7.2.4 Violin Plot

#requires ggplot2 package
##you will likely need to modify the bw
ggplot(DATASET_OBJECT, aes(x = 1, y = VARIABLE_OF_INTEREST)) +
  geom_violin(fill = "lightgrey", bw = 1.2)  + geom_boxplot(width=0.1) +
  labs(
    title = "TITLE",
    y = "Y AXIS LABEL"
  ) + theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

7.2.5 Bar Chart

#requires ggplot2 package
##YOU NEED TO FIRST GROUP THE DATA
grouped_data <- DATASET_OBJECT |> st_drop_geometry() |> 
  group_by(VARIABLE_OF_INTEREST) |>
  summarise(Count = n()) 

#this plots the bar plot (note that coord_flip switches the axes, which is optional)
ggplot(grouped_data, aes(x = VARIABLE_OF_INTEREST, y = Count)) + 
  geom_bar(stat = "identity") + coord_flip() + labs(
    x = "X_AXIS_LABEL",
    y = "Y_AXIS_LABEL"
  )

7.3 Maps

#requires tmap package
## classification styles
## "equal"
## "fisher"
## "pretty"
## "quantile"

#command to see color palettes
cols4all::c4a_gui()

##make a basic point map
tm_shape(DATABASE_OBJECT) + tm_dots(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLORPALETTE", style = "CLASSIFICATION", n= NUMBEROFCLASSES)) 

##make a basic polygon map
tm_shape(DATABASE_OBJECT) + tm_polygons(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLORPALETTE", style = "CLASSIFICATION", n = NUMBEROFCLASSES)) 


## make an interactive map
tmap_mode(mode = "view")

##go back to static map 
tmap_mode(mode = "plot")


## add title 
MAPCOMMANDS + tm_title("TITLE")


## Add transparency
tm_shape(DATABASE_OBJECT) + tm_polygons(fill = "VARIABLE_OF_INTEREST", fill_alpha = VALUE) 

7.4 Descriptive Spatial Statistics

7.4.1 Mean Center and Weighted Mean Center

#requires sfdep package

#calculate mean center
DATASET_mean_center <- center_mean(DATASET)

#calculate weighted mean center (saved under a new name so it does not overwrite the unweighted mean center)
DATASET_wtd_mean_center <- center_mean(DATASET, weight = DATASET$VARIABLE)

7.4.2 Standard Deviational Ellipse

#requires sfdep package
#calculate standard ellipse values
std_ellip_DATASET <- std_dev_ellipse(DATASET)

#create an ellipse of those values
std_ellip_DATASET <- sfdep::st_ellipse(geometry =std_ellip_DATASET,
                                   sx = std_ellip_DATASET$sx,
                                   sy = std_ellip_DATASET$sy,
                                   rotation = -std_ellip_DATASET$theta)

7.5 Probability

7.5.1 Binomial Distribution

#per trial probability, x = number of trials, p = probability per trial
probability_per_trial <- tibble(
  num_trials = 0:X,
  #here is the main probability function
  probability = dbinom(x = 0:X, size = X, prob = p)
)

#cumulative probability
cum_prob <- pbinom(VALUELESSTHAN, TOTALTRIALS, prob = p)

7.5.2 Normal Distribution

#pnorm calculates the CDF up to the value. Do 1-pnorm to get the probability above that value
prob <- pnorm(VALUEOFINTEREST, mean =  mean(DATASET$VARIABLE), sd = sd(DATASET$VARIABLE))

7.6 Estimation in Sampling

7.6.1 Point Estimate and CI for Mean

#t test command (assumes two-tail)
t.test(DATASET$VARIABLE, conf.level = 0.95)

7.6.2 Point Estimate and CI for Proportion

prop.test(TOTALSUCCESSES, TOTALTRIALS, conf.level = 0.95)

7.7 Hypothesis Testing

7.7.1 One-Sample T-Test

#remove alternative to run two-sided T-test
t.test(DATASET$VARIABLE, mu = TRUEMEAN, alternative = "less", conf.level = 0.95)

7.7.2 Two-Sample T-Test

#test for equality of variance
leveneTest(VARIABLE ~ CATEGORY, data = DATASET)

#change var.equal if Levene's test indicates unequal variance
t.test(VARIABLE ~ CATEGORY, data = DATASET, var.equal = TRUE)

7.7.3 Paired T-Test

#remove alternative to run two-sided
t.test(DATASET$VARIABLE1, DATASET$VARIABLE2, paired = TRUE, alternative = "greater", conf.level = 0.95)

7.7.4 Chi Square Test of Independence

chisq.test(table(DATASET$VARIABLE1, DATASET$VARIABLE2))

7.8 Point Pattern Analysis

7.8.1 Quadrat Analysis

#boundary must be owin object
boundary_owin <- BOUNDARYOBJECT |> as.owin()

#turn point data into a ppp object
data.ppp <- as.ppp(st_coordinates(DATA), W = boundary_owin)

#create a quadrat count with 9 quadrats (modify the number of quadrats to fit your case)
q_count <- quadratcount(data.ppp, nx = 3, ny = 3)

#run quadrat test with monte carlo sims
q_test <- quadrat.test(data.ppp, nx = 3, ny = 3, method= "MonteCarlo", nsim=9)

7.8.2 Kernel Density

#estimate KDE
kernel_density <- density(data.ppp)

7.8.3 ANN Analysis

#compute observed ANN
ANN_obs <- mean(nndist(data.ppp))

nsim <- 999
ANN_sim <- numeric(nsim)

#MC simulations
for(i in 1:nsim){
  sim <- rpoispp(kernel_density) #simulate inhomogeneous Poisson using KDE if necessary
  ANN_sim[i] <- mean(nndist(sim))
}

#monte Carlo p-value (two-sided)
N_greater <- sum(ANN_sim >= ANN_obs)
p_value <- 2 * min((N_greater + 1)/(nsim + 1), (nsim - N_greater + 1)/(nsim + 1))

p_value

7.8.4 L-function

#first point pattern demonstrated global structure, therefore we use an inhomogeneous Poisson as comparison. Correction corrects for edge effects. 
l_funct_pattern1 <- envelope(
  data.ppp,
  fun = Linhom, #inhomogeneous; use Lest for homogeneous
  nsim = 99,
  funargs = list(correction = "border"),
  transform = expression(. - r)   
)

7.9 Spatial Autocorrelation and Spatial Modeling

7.9.1 Calculating Neighbors

## QUEENS CASE NEIGHBORS ###
#calculate neighbors
nb_queen <- poly2nb(DATASET, queen = TRUE) # queen shares point or border
#set neighborhood weight matrix
nbw_queen <- nb2listw(nb_queen, style = "W")


## DISTANCE BASED NEIGHBORS ##
#calculate neighborhoods, threshold is units in CRS
nb_dist <- dnearneigh(DATASET, d1 = 0, d2 = THRESHOLD) 
#set neighborhood weight matrix
nbw_dist <- nb2listw(nb_dist, style = "W", zero.policy = TRUE)


## K NEAREST NEIGHBORS ####
#calculate nearest neighbors k= number of neighbors
knn<- knearneigh(DATASET, k = NUMNEIGHBORS)
#calculate neighborhoods
nb_knn  <- knn2nb(knn)
#set neighborhood weight matrix
nbw_knn <- nb2listw(nb_knn, style = "W")

7.9.2 Global Moran’s I

gmoran <- moran.test(DATASET$VARIABLE, NBWMATRIX)

7.9.3 LISA

locali<-localmoran_perm(DATASET$VARIABLE, NBWMATRIX) %>%
  as_tibble() %>%
  set_names(c("local_i", "exp_i", "var_i", "z_i", "p_i",
              "p_i_sim", "pi_sim_folded", "skewness", "kurtosis"))

#bind LISA results to dataset
DATASET <- DATASET %>%
  bind_cols(locali)

#Divide into quadrants
DATASET <- DATASET %>%
  mutate(variablez =  as.numeric(scale(VARIABLE)),
         evratezlag = lag.listw(NBWMATRIX, variablez),
         lisa_cluster = case_when(
           p_i >= 0.05 ~ "Not significant",
           variablez > 0 & local_i > 0 ~ "High-high",
           variablez > 0 & local_i < 0 ~ "High-low",
           variablez < 0 & local_i > 0 ~ "Low-low",
           variablez < 0 & local_i < 0 ~ "Low-high"
         ))

7.9.4 Traditional Spatial Models

#lag model
fit.lag<-lagsarlm(DEPENDENT ~ INDEPENDENT + INDEPENDENT,  
                  data = DATA, 
                  listw = nbw) 
#see results
summary(fit.lag)

#error model
fit.error<-errorsarlm(DEPENDENT ~ INDEPENDENT + INDEPENDENT,  
                  data = DATA, 
                  listw = nbw)  
#see results
summary(fit.error)

7.9.5 Bayesian Model

nb <- poly2nb(DATA)

nb2INLA("map.adj")
g <- inla.read.graph(filename = "map.adj")

DATA$re_u <- 1:nrow(DATA)

formula <- DEPENDENT ~ INDEPENDENT + INDEPENDENT +
  f(re_u, model = "bym2", graph = g)

res <- inla(
  formula,
  family = "gaussian",
  data = DATA,
  control.predictor = list(compute = TRUE),
  control.compute = list(return.marginals.predictor = TRUE)
)

7.10 Continuous Surfaces

7.10.1 IDW

# create a regular prediction grid. We say we want 2000 pixels
grd <- as.data.frame(spsample(DATA, "regular", n = 2000))

#set grid coordinates
names(grd) <- c("X", "Y")
coordinates(grd) <- c("X", "Y")
gridded(grd) <- TRUE
fullgrid(grd) <- TRUE

#assign CRS
crs(grd) <- crs(DATA)

# run IDW interpolation using the best IDP from cross-validation
idw <- idw(
  formula = value ~ 1,
  locations = DATA,
  newdata = grd,
  idp = 2
)

#rasterize interpolation output
r <- rast(idw)

#mask to boundaries
r_m <- mask(r, BOUNDARY)

7.10.2 Kriging

#compute empirical variogram
v <- variogram(value ~ 1, data = DATA)

#plot empirical variogram
plot(v)

#specify an initial variogram model
vgm_initial <- vgm(
  psill = FILL,
  model = "FILL",
  range = FILL,
  nugget = FILL
)

#fit empirical to initial model
vgm_fit <- fit.variogram(v, vgm_initial)

#plot fitted variogram model 
plot(v, vgm_fit)


#fit kriging
krig_pred <- krige(
  formula = value ~ 1,
  locations = DATA,
  newdata = grd,
  model = vgm_fit
)

#rasterize predictions
krig_rast <- rast(krig_pred)

#mask to boundary
krig_mask <- mask(krig_rast, BOUNDARY)