7  Command Compendium

This chapter is designed to help you easily craft commands.

ggplot color options

7.1 Basic Data Manipulation

7.1.1 Creating Objects

Creating an object allows you to store the output of a command in the environment. This means you can reuse the result later by referencing the object name, instead of rerunning the command every time.

## When this command runs, it will show up in the environment
chart <- ggplot(DATASET_OBJECT, aes(y = VARIABLE OF INTEREST)) +
  geom_boxplot(fill = "lightgrey")

##now I can "call" the object
chart

7.1.2 Filtering Data

To create our analytical dataset we often need to subset a larger dataset. Filtering allows us to simplify. We always want to save filtered data as objects so that we can reference them in other commands.

#to select certain columns
columns <- DATASET |> select(COLUMN1, COLUMN2, COLUMN3)

### IF VALUES ARE STRING (NON-NUMERIC) THEY MUST BE IN "" 

#to filter by values in a certain column
values <- DATASET |> filter(COLUMN == VALUE)

# to filter by multiple conditions. 
#and
values <- DATASET |> filter(COLUMN == VALUE & COLUMN2 == VALUE2)

#or 
values <- DATASET |> filter(COLUMN == VALUE | COLUMN2 == VALUE2)

#in 
values <- DATASET |> filter(COLUMN %in% c(VALUE1, VALUE2, VALUE3))

#numeric operator
values <- DATASET |> filter(COLUMN < 2000)

7.1.3 Making New Columns

#creating new columns
dataset <- DATASET |> mutate(NEWCOLUMN = COMMAND, NEWCOLUMN2 = COMMAND)

7.1.4 Renaming Columns

#renaming columns
clean_data <- DATASET |> rename(
  new_name1 = old_name1,
  new_name2 = old_name2
)

7.1.5 Grouping and Summarizing Data

summary_data <- DATASET |> 
  group_by(CATEGORY_COLUMN) |> 
  summarize(
    mean_value = mean(NUMERIC_COLUMN, na.rm = TRUE),
    count = n()
  )

7.1.6 Creating a Spatial Object

#assumes a nonspatial object with a longitude and latitude field
grouped_pm_sf <- YOUR_NEWOBJECT |> st_as_sf(coords= c("lat", "lon"), crs = 4326)

7.2 Descriptive Statistics

7.2.1 Descriptive Statistics Tables

#requires e1071 package
#requires gt package
## descriptive statistics for quantitative data
  DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  select(VARIABLE_OF_INTEREST) |> #field name of variable of interest
  summarise(                  #summarizes variable of interest. You can add/remove
    n = n(),
    num_na = sum(is.na(VARIABLE_OF_INTEREST)), 
    mean = mean(VARIABLE_OF_INTEREST, na.rm = TRUE),
    median = median(VARIABLE_OF_INTEREST, na.rm = TRUE),
    sd = sd(VARIABLE_OF_INTEREST, na.rm = TRUE),
    variance = var(VARIABLE_OF_INTEREST, na.rm = TRUE),
    mean_dev = mean(abs(VARIABLE_OF_INTEREST - mean(VARIABLE_OF_INTEREST, na.rm = TRUE)), na.rm = TRUE),
    cv = sd / mean,
    min = min(VARIABLE_OF_INTEREST, na.rm = TRUE),
    max = max(VARIABLE_OF_INTEREST, na.rm = TRUE),
    skewness = skewness(VARIABLE_OF_INTEREST, na.rm = TRUE), 
    kurtosis = kurtosis(VARIABLE_OF_INTEREST, na.rm = TRUE)
                               
  ) |> 
    #this starts the table formatting.
  pivot_longer(everything(), names_to = "Statistic", values_to = "Value") |>
  gt() |>
  tab_header(
    title = "TITLE", ##Add your title here
  ) %>%
  fmt_number(
    columns = everything(),
    decimals = 2 ##You can change number of decimals
  )

## Descriptive statistics table for categorical data
DATASET_OBJECT |> #whatever the object is named
  st_drop_geometry() |> #gets rid of geometry column
  group_by(ruca_code) |> #variable that you want to summarize
  summarise(n = n())  |> #calculates the count
  gt() #formats into a table

7.2.2 Histogram

#requires ggplot2 package
## Create a basic histogram. 
## You will likely need to modify the binwidth command
ggplot(DATASET_OBJECT, aes(x = VARIABLE OF INTEREST)) +
  geom_histogram(binwidth = 1, fill = "lightgrey", color = "black", alpha = 0.7) +
  labs(
    title = "TITLE",
    x = "X-AXIS LABEL",
    y = "Y-AXIS LABEL"
  ) 

7.2.3 Boxplot

#requires ggplot2 package
# create basic boxplot
ggplot(DATASET_OBJECT, aes(y = VARIABLE OF INTEREST)) +
  geom_boxplot(fill = "lightgrey") +
  labs(
    title = "TITLE",
    y = "Y-AXIS LABEL"
  )

7.2.4 Violin Plot

#requires ggplot2 package
##you will likely need to modify the bw
ggplot(DATASET_OBJECT, aes(x = 1, y = VARIABLE_OF_INTEREST)) +
  geom_violin(fill = "lightgrey", bw = 1.2)  + geom_boxplot(width=0.1) +
  labs(
    title = "TITLE",
    y = "Y AXIS LABEL"
  ) + theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())

7.2.5 Bar Chart

#requires ggplot2 package
##YOU NEED TO FIRST GROUP THE DATA
grouped_data <- DATASET_OBJECT |> st_drop_geometry() |> 
  group_by(VARIABLE_OF_INTEREST) |>
  summarise(Count = n()) 

#this plots the bar plot (note that the coord_flip swtiches the axis which is optional
ggplot(grouped_data, aes(x=VARIABLE_OF_INTEREST, y=Count)) + 
  geom_bar(stat = "identity") + coord_flip()  + labs(
    x = "X_AXIS_LABEL",
    y = "Y_AXIS_LABEL",
  )

7.3 Maps

#required tmap package
## classification styles
## "equal"
## "fisher"
## "pretty"
## "quantile"

#command to see color palettes
cols4all::c4a_gui()

##make a basic point map
tm_shape(DATABASE_OBJECT) + tm_dots(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLORPALETTE", style = "CLASSIFICATION", n= NUMBEROFCLASSES)) 

##make a basic polygon map
tm_shape(DATABASE_OBJECT) + tm_polygon(fill = "VARIABLE_OF_INTEREST",
                fill.scale = tm_scale_intervals(values = "brewer.COLOR_PALETTE", style = "CLASSIFICATION", n= NUMBEROFCLASSES)) 


## make an interactive map
tmap_mode(mode = "view")

##go back to static map 
tmap_mode(mode = "plot")


## add title 
MAPCOMMANDS + tm_title("TITLE")


## Add transparency
tm_shape(DATABASE_OBJECT) + tm_polygon(fill = "VARIABLE_OF_INTEREST",fill_alpha = VALUE) 

7.4 Descriptive Spatial Statistics

7.4.1 Mean Center and Weighted Mean Center

#requires sfdep package

#calculate mean center
DATASET_mean_center <- center_mean(DATASET)

#calculate weighted mean center
DATASET_mean_center <- center_mean(DATASET, weight = DATASET$VARIABLE)

7.4.2 Standard Deviational Ellipse

#requires sfdep package
#calculate standard ellipse values
std_ellip_DATASET <- std_dev_ellipse(DATASET)

#create an ellipse of those values
std_ellip_DATASET <- sfdep::st_ellipse(geometry =std_ellip_DATASET,
                                   sx = std_ellip_DATASET$sx,
                                   sy = std_ellip_DATASET$sy,
                                   rotation = -std_ellip_DATASET$theta)

7.5 Probability

7.5.1 Binomial Distribution

#per trial probability, x = number of trials, p = probability per trial
probability_per_trial <- tibble(
  num_trials = 0:X,
  #here is the main probability function
  probability = dbinom(x = 0:X, size = X, prob = p)
)

#cumulative probability
cum_prob <- pbinom(VALUELESSTHAN, TOTALTRIALS, p = p)

7.5.2 Normal Distribution

#pnorm calculates the CDF up to the value. Do 1-pnorm to get the probability above that value
prob <- pnorm(VALUEOFINTEREST, mean =  mean(DATASET$VARIABLE), sd = sd(DATASET$VARIABLE))

7.6 Estimation in Sampling

7.6.1 Point Estimate and CI for Mean

#t test command (assumes two-tail)
t.test(DATASET$VARIABLE, conf.level = 0.95)

7.6.2 Point Estimate and CI for Proportion

prop.test(TOTALSUCCESSES, n, conf.level = 0.95)

7.7 Hypothesis Testing

7.7.1 One-Sample T-Test

#remove alternative to run two-sided T-test
t.test(DATASET$VARIABLE, mu = TRUEMEAN, alternative = "less", conf.level = 0.95)

7.7.2 Two-Sample T-Test

#test for equality of variance
leveneTest(VARIABLE ~ CATEGORY, data = DATASET)

#change var.equal if Levene's test indicates unequal variance
t.test(VARIABLE ~ CATEGORY, data = DATASET, var.equal = TRUE)

7.7.3 Paired T-Test

#remove alternative to run two-sided
t.test(DATASET$VARIABLE1, DATASET$VARIABLE2, paired = TRUE, alternative = "greater", conf.level = 0.95)

7.7.4 Chi Square Test of Independence

chisq.test(table(DATASET$VARIABLE1, DATASET$VARIABLE2))

7.8 Point Pattern Analysis

7.8.1 Quadrat Analysis

#boundary must be owin object
boundary_owin <- BOUNDARYOBJECT |> as.owin()

#turn point data into into a ppp
data.ppp <- as.ppp(st_coordinates(DATA), W = boundary_owin)

#create a quadrat count with 9 quadrats (modify the number of quadrats to fit your case)
q_count <- quadratcount(data.ppp, nx = 3, ny = 3)

q_test <- quadrat.test(data.ppp, nx = 3, ny = 3)

7.8.2 Kernel Density

#sigma represents the bandwidth of the kernel. It is in the unit of the data crs. 
kernel_density <- density(data.ppp, 
                          sigma = BANDWIDTH)

7.8.3 G-function

g_pattern <- Gest(data.ppp, correction = "none")

7.8.4 F-function

f_pattern <- Fest(data.ppp, correction = "none")

7.9 Spatial Autocorrelation

7.9.1 Calculating Neighbors

## QUEENS CASE NEIGHBORS ###
#calculate neighbors
nb_queen <- poly2nb(DATASET, queen = TRUE) # queen shares point or border
#set neighborhood weight matrix
nbw_queen <- nb2listw(nb_queen, style = "W")


## DISTANCE BASED NEIGHBORS ##
#calculate neighborhoods, threshold is units in CRS
nb_dist <- dnearneigh(schools_sf, d1 = 0, d2 = THRESHOLD) 
#set neighborhood weight matrix
nbw_dist <- nb2listw(nb_dist, style = "W", zero.policy = T)


## K NEAREST NEIGHBORS ####
#calculate nearest neighbors k= number of neighbors
knn<- knearneigh(DATASET, k = NUMNEIGHBORS)
#calculate neighborhoods
nb_knn  <- knn2nb(knn)
#set neighborhood weight matrix
nbw_knn <- nb2listw(nb_knn, style = "W")

7.9.2 Global Moran’s I

gmoran <- moran.test(DATASET$VARIABLE, NBWMATRIX)

7.9.3 LISA

locali<-localmoran_perm(DATASET$VARIABLE, NBWMATRIX) %>%
  as_tibble() %>%
  set_names(c("local_i", "exp_i", "var_i", "z_i", "p_i",
              "p_i_sim", "pi_sim_folded", "skewness", "kurtosis"))

#bind LISA results to dataset
DATASET <- DATASET %>%
  bind_cols(locali)

#Divide into quadrants
DATASET <- DATASET %>%
  mutate(variablez =  as.numeric(scale(VARIABLE)),
         evratezlag = lag.listw(NBWMATRIX, variablez),
         lisa_cluster = case_when(
           p_i >= 0.05 ~ "Not significant",
           variablez > 0 & local_i > 0 ~ "High-high",
           variablez > 0 & local_i < 0 ~ "High-low",
           variablez < 0 & local_i > 0 ~ "Low-low",
           variablez < 0 & local_i < 0 ~ "Low-high"
         ))