7  Command Compendium

This chapter is designed to help you easily craft commands.

ggplot color options

7.1 Basic Data Manipulation

7.1.1 Creating Objects

Creating an object allows you to store the output of a command in the environment. This means you can reuse the result later by referencing the object name, instead of rerunning the command every time.

## When this command runs, it will show up in the environment
chart <- ggplot(DATASET_OBJECT, aes(y = VARIABLE_OF_INTEREST)) +
  geom_boxplot(fill = "lightgrey")

##now I can "call" the object
chart

7.1.2 Filtering Data

To create our analytical dataset we often need to subset a larger dataset. Filtering allows us to simplify. We always want to save filtered data as objects so that we can reference them in other commands.

#to select certain columns
columns <- DATASET |> select(COLUMN1, COLUMN2, COLUMN3)

### IF VALUES ARE STRING (NON-NUMERIC) THEY MUST BE IN "" 

#to filter by values in a certain column
values <- DATASET |> filter(COLUMN == VALUE)

# to filter by multiple conditions. 
#and
values <- DATASET |> filter(COLUMN == VALUE & COLUMN2 == VALUE2)

#or 
values <- DATASET |> filter(COLUMN == VALUE | COLUMN2 == VALUE2)

#in 
values <- DATASET |> filter(COLUMN %in% c(VALUE1, VALUE2, VALUE3))

#numeric operator
values <- DATASET |> filter(COLUMN < 2000)

7.1.3 Making New Columns

#creating new columns
dataset <- DATASET |> mutate(NEWCOLUMN = COMMAND, NEWCOLUMN2 = COMMAND)

7.1.4 Renaming Columns

#renaming columns
clean_data <- DATASET |> rename(
  new_name1 = old_name1,
  new_name2 = old_name2
)

7.1.5 Grouping and Summarizing Data

summary_data <- DATASET |> 
  group_by(CATEGORY_COLUMN) |> 
  summarize(
    mean_value = mean(NUMERIC_COLUMN, na.rm = TRUE),
    count = n()
  )

7.1.6 Creating a Spatial Object

#assumes a nonspatial object with a longitude and latitude field
grouped_pm_sf <- YOUR_NEWOBJECT |> st_as_sf(coords = c("lon", "lat"), crs = 4326)

7.2 Descriptive Statistics

7.2.1 Descriptive Statistics Tables

#requires e1071 package
#requires gt package
## descriptive statistics for quantitative data
  DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  select(VARIABLE_OF_INTEREST) |> #field name of variable of interest
  summarise(                  #summarizes variable of interest. You can add/remove
    n = n(),
    num_na = sum(is.na(VARIABLE_OF_INTEREST)), 
    mean = mean(VARIABLE_OF_INTEREST, na.rm = TRUE),
    median = median(VARIABLE_OF_INTEREST, na.rm = TRUE),
    sd = sd(VARIABLE_OF_INTEREST, na.rm = TRUE),
    variance = var(VARIABLE_OF_INTEREST, na.rm = TRUE),
    mean_dev = mean(abs(VARIABLE_OF_INTEREST - mean(VARIABLE_OF_INTEREST, na.rm = TRUE)), na.rm = TRUE),
    cv = sd / mean,
    min = min(VARIABLE_OF_INTEREST, na.rm = TRUE),
    max = max(VARIABLE_OF_INTEREST, na.rm = TRUE),
    skewness = skewness(VARIABLE_OF_INTEREST, na.rm = TRUE), 
    kurtosis = kurtosis(VARIABLE_OF_INTEREST, na.rm = TRUE)
                               
  ) |> 
    #this starts the table formatting.
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
  gt() |>
  tab_header(
    title = "TITLE" ##Add your title here
  ) |>
  fmt_number(
    columns = everything(),
    decimals = 2 ##You can change number of decimals
  )

## Descriptive statistics table for categorical data
DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  group_by(ruca_code) |> #variable that you want to summarize
  summarise(n = n())  |> #calculates the count
  gt() #formats into a table

7.2.2 Histogram

#requires ggplot2 package
## Create a basic histogram. 
## You will likely need to modify the binwidth command
ggplot(DATASET_OBJECT, aes(x = VARIABLE_OF_INTEREST)) +
  geom_histogram(binwidth = 1, fill = "lightgrey", color = "black", alpha = 0.7) +
  labs(
    title = "TITLE",
    x = "X-AXIS LABEL",
    y = "Y-AXIS LABEL"
  ) 

7.2.3 Boxplot

#requires ggplot2 package
# create basic boxplot
ggplot(DATASET_OBJECT, aes(y = VARIABLE_OF_INTEREST)) +
  geom_boxplot(fill = "lightgrey") +
  labs(
    title = "TITLE",
    y = "Y-AXIS LABEL"
  )

7.2.4 Violin Plot

#requires ggplot2 package
##you will likely need to modify the bw
ggplot(DATASET_OBJECT, aes(x = 1, y = VARIABLE_OF_INTEREST)) +
  geom_violin(fill = "lightgrey", bw = 1.2)  + geom_boxplot(width=0.1) +
  labs(
    title = "TITLE",
    y = "Y AXIS LABEL"
  ) + theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

7.2.5 Bar Chart

#requires ggplot2 package
##YOU NEED TO FIRST GROUP THE DATA
grouped_data <- DATASET_OBJECT |> st_drop_geometry() |> 
  group_by(VARIABLE_OF_INTEREST) |>
  summarise(Count = n()) 

#this plots the bar plot (note that coord_flip switches the axes, which is optional)
ggplot(grouped_data, aes(x = VARIABLE_OF_INTEREST, y = Count)) + 
  geom_bar(stat = "identity") + coord_flip() + labs(
    x = "X_AXIS_LABEL",
    y = "Y_AXIS_LABEL"
  )

7.3 Maps

#requires tmap package
## classification styles
## "equal"
## "fisher"
## "pretty"
## "quantile"

#command to see color palettes
cols4all::c4a_gui()

##make a basic point map
tm_shape(DATABASE_OBJECT) + tm_dots(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLORPALETTE", style = "CLASSIFICATION", n= NUMBEROFCLASSES)) 

##make a basic polygon map
tm_shape(DATABASE_OBJECT) + tm_polygons(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLORPALETTE", style = "CLASSIFICATION", n = NUMBEROFCLASSES)) 


## make an interactive map
tmap_mode(mode = "view")

##go back to static map 
tmap_mode(mode = "plot")


## add title 
MAPCOMMANDS + tm_title("TITLE")


## Add transparency
tm_shape(DATABASE_OBJECT) + tm_polygons(fill = "VARIABLE_OF_INTEREST", fill_alpha = VALUE) 

7.4 Descriptive Spatial Statistics

7.4.1 Mean Center and Weighted Mean Center

#requires sfdep package

#calculate mean center
DATASET_mean_center <- center_mean(DATASET)

#calculate weighted mean center (saved under a new name so it does not overwrite the unweighted mean center)
DATASET_wtd_mean_center <- center_mean(DATASET, weight = DATASET$VARIABLE)

7.4.2 Standard Deviational Ellipse

#requires sfdep package
#calculate standard ellipse values
std_ellip_DATASET <- std_dev_ellipse(DATASET)

#create an ellipse of those values
std_ellip_DATASET <- sfdep::st_ellipse(geometry =std_ellip_DATASET,
                                   sx = std_ellip_DATASET$sx,
                                   sy = std_ellip_DATASET$sy,
                                   rotation = -std_ellip_DATASET$theta)

7.5 Probability

7.5.1 Binomial Distribution

#per trial probability, x = number of trials, p = probability per trial
probability_per_trial <- tibble(
  num_trials = 0:X,
  #here is the main probability function
  probability = dbinom(x = 0:X, size = X, prob = p)
)

#cumulative probability
cum_prob <- pbinom(VALUELESSTHAN, TOTALTRIALS, prob = p)

7.5.2 Normal Distribution

#pnorm calculates the CDF up to the value. Do 1-pnorm to get the probability above that value
prob <- pnorm(VALUEOFINTEREST, mean =  mean(DATASET$VARIABLE), sd = sd(DATASET$VARIABLE))

7.6 Estimation in Sampling

7.6.1 Point Estimate and CI for Mean

#t test command (assumes two-tail)
t.test(DATASET$VARIABLE, conf.level = 0.95)

7.6.2 Point Estimate and CI for Proportion

prop.test(TOTALSUCCESSES, TOTALTRIALS, conf.level = 0.95)

7.7 Hypothesis Testing

7.7.1 One-Sample T-Test

#remove alternative to run two-sided T-test
t.test(DATASET$VARIABLE, mu = TRUEMEAN, alternative = "less", conf.level = 0.95)

7.7.2 Two-Sample T-Test

#test for equality of variance
leveneTest(VARIABLE ~ CATEGORY, data = DATASET)

#change var.equal if Levene's test indicates unequal variance
t.test(VARIABLE ~ CATEGORY, data = DATASET, var.equal = TRUE)

7.7.3 Paired T-Test

#remove alternative to run two-sided
t.test(DATASET$VARIABLE1, DATASET$VARIABLE2, paired = TRUE, alternative = "greater", conf.level = 0.95)

7.7.4 Chi Square Test of Independence

chisq.test(table(DATASET$VARIABLE1, DATASET$VARIABLE2))

7.8 Point Pattern Analysis

7.8.1 Quadrat Analysis

#boundary must be owin object
boundary_owin <- BOUNDARYOBJECT |> as.owin()

#turn point data into a ppp object
data.ppp <- as.ppp(st_coordinates(DATA), W = boundary_owin)

#create a quadrat count with 9 quadrats (modify the number of quadrats to fit your case)
q_count <- quadratcount(data.ppp, nx = 3, ny = 3)

#run quadrat test with monte carlo sims
q_test <- quadrat.test(data.ppp, nx = 3, ny = 3, method= "MonteCarlo", nsim=9)

7.8.2 Kernel Density

#estimate KDE
kernel_density <- density(data.ppp)

7.8.3 ANN Analysis

#compute observed ANN
ANN_obs <- mean(nndist(data.ppp))

nsim <- 999
ANN_sim <- numeric(nsim)

#MC simulations
for(i in 1:nsim){
  sim <- rpoispp(kernel_density) #simulate inhomogeneous Poisson using KDE if necessary
  ANN_sim[i] <- mean(nndist(sim))
}

#monte Carlo p-value (two-sided)
N_greater <- sum(ANN_sim >= ANN_obs)
p_value <- 2 * min((N_greater + 1)/(nsim + 1), (nsim - N_greater + 1)/(nsim + 1))

p_value

7.8.4 L-function

#first point pattern demonstrated global structure, therefore we use an inhomogeneous Poisson as comparison. Correction corrects for edge effects. 
l_funct_pattern1 <- envelope(
  data.ppp,
  fun = Linhom, #inhomogeneous; use Lest for homogeneous
  nsim = 99,
  funargs = list(correction = "border"),
  transform = expression(. - r)   
)

7.9 Spatial Autocorrelation and Spatial Modeling

7.9.1 Calculating Neighbors

## QUEENS CASE NEIGHBORS ###
#calculate neighbors
nb_queen <- poly2nb(DATASET, queen = TRUE) # queen shares point or border
#set neighborhood weight matrix
nbw_queen <- nb2listw(nb_queen, style = "W")


## DISTANCE BASED NEIGHBORS ##
#calculate neighborhoods, threshold is units in CRS
nb_dist <- dnearneigh(DATASET, d1 = 0, d2 = THRESHOLD) 
#set neighborhood weight matrix
nbw_dist <- nb2listw(nb_dist, style = "W", zero.policy = TRUE)


## K NEAREST NEIGHBORS ####
#calculate nearest neighbors k= number of neighbors
knn<- knearneigh(DATASET, k = NUMNEIGHBORS)
#calculate neighborhoods
nb_knn  <- knn2nb(knn)
#set neighborhood weight matrix
nbw_knn <- nb2listw(nb_knn, style = "W")

7.9.2 Global Moran’s I

gmoran <- moran.test(DATASET$VARIABLE, NBWMATRIX)

7.9.3 LISA

locali<-localmoran_perm(DATASET$VARIABLE, NBWMATRIX) %>%
  as_tibble() %>%
  set_names(c("local_i", "exp_i", "var_i", "z_i", "p_i",
              "p_i_sim", "pi_sim_folded", "skewness", "kurtosis"))

#bind LISA results to dataset
DATASET <- DATASET %>%
  bind_cols(locali)

#Divide into quadrants
DATASET <- DATASET %>%
  mutate(variablez =  as.numeric(scale(VARIABLE)),
         evratezlag = lag.listw(NBWMATRIX, variablez),
         lisa_cluster = case_when(
           p_i >= 0.05 ~ "Not significant",
           variablez > 0 & local_i > 0 ~ "High-high",
           variablez > 0 & local_i < 0 ~ "High-low",
           variablez < 0 & local_i > 0 ~ "Low-low",
           variablez < 0 & local_i < 0 ~ "Low-high"
         ))

7.9.4 Traditional Spatial Models

#lag model
fit.lag<-lagsarlm(DEPENDENT ~ INDEPENDENT + INDEPENDENT,  
                  data = DATA, 
                  listw = nbw) 
#see results
summary(fit.lag)

#error model
fit.error<-errorsarlm(DEPENDENT ~ INDEPENDENT + INDEPENDENT,  
                  data = DATA, 
                  listw = nbw)  
#see results
summary(fit.error)

7.9.5 Bayesian Model

nb <- poly2nb(DATA)

nb2INLA("map.adj")
g <- inla.read.graph(filename = "map.adj")

DATA$re_u <- 1:nrow(DATA)

formula <- DEPENDENT ~ INDEPENDENT + INDEPENDENT +
  f(re_u, model = "bym2", graph = g)

res <- inla(
  formula,
  family = "gaussian",
  data = DATA,
  control.predictor = list(compute = TRUE),
  control.compute = list(return.marginals.predictor = TRUE)
)

7.10 Continuous Surfaces

7.10.1 IDW

# create a regular prediction grid. We say we want 2000 pixels
grd <- as.data.frame(spsample(DATA, "regular", n = 2000))

#set grid coordinates
names(grd) <- c("X", "Y")
coordinates(grd) <- c("X", "Y")
gridded(grd) <- TRUE
fullgrid(grd) <- TRUE

#assign CRS
crs(grd) <- crs(DATA)

# run IDW interpolation using the best IDP from cross-validation
idw <- idw(
  formula = value ~ 1,
  locations = DATA,
  newdata = grd,
  idp = 2
)

#rasterize interpolation output
r <- rast(idw)

#mask to boundaries
r_m <- mask(r, BOUNDARY)

7.10.2 Kriging

#compute empirical variogram
v <- variogram(value ~ 1, data = DATA)

#plot empirical variogram
plot(v)

#specify an initial variogram model
vgm_initial <- vgm(
  psill = FILL,
  model = "FILL",
  range = FILL,
  nugget = FILL
)

#fit empirical to initial model
vgm_fit <- fit.variogram(v, vgm_initial)

#plot fitted variogram model 
plot(v, vgm_fit)


#fit kriging
krig_pred <- krige(
  formula = value ~ 1,
  locations = DATA,
  newdata = grd,
  model = vgm_fit
)

#rasterize predictions
krig_rast <- rast(krig_pred)

#mask to boundary
krig_mask <- mask(krig_rast, BOUNDARY)