## ----set_options, include = FALSE---------------------------------------------
knitr::opts_chunk$set(
  eval = FALSE, # Chunks of codes will not be evaluated by default
  collapse = TRUE,
  comment = "#>",
  fig.width = 7, fig.height = 5   # Set device size at rendering time (when plots are generated)
)

## ----setup, eval = TRUE, include = FALSE--------------------------------------
library(deepSTRAPP)

is_dev_version <- function (pkg = "deepSTRAPP")
{
  # # Check if ran on CRAN
  # not_cran <- identical(Sys.getenv("NOT_CRAN"), "true") # || interactive()

  # Version number check
  version <- tryCatch(as.character(utils::packageVersion(pkg)), error = function(e) "")
  dev_version <- grepl("\\.9000", version)

  # not_cran || dev_version
  
  return(dev_version)
}


## ----adjust_dpi_CRAN, include = FALSE, eval = !is_dev_version()---------------
knitr::opts_chunk$set(
  dpi = 72   # Lower DPI to save space
)

## ----adjust_dpi_dev, include = FALSE, eval = is_dev_version()-----------------
# knitr::opts_chunk$set(
#   dpi = 72   # Default DPI for the dev version
# )

## ----model_trait_evolution_biogeo---------------------------------------------
# 
# # ------ Step 0: Load data ------ #
# 
# ## Load phylogeny and tip data
# 
# library(phytools)
# data(eel.tree)
# data(eel.data)
# # Dataset of feeding mode and maximum total length from 61 species of elopomorph eels.
# # Source: Collar, D. C., P. C. Wainwright, M. E. Alfaro, L. J. Revell, and R. S. Mehta (2014)
# # Biting disrupts integration to spur skull evolution in eels. Nature Communications, 5, 5505.
# 
# # Transform feeding mode data into biogeographic data with ranges A, B, and AB.
# # This is NOT actual biogeographic data, but data fake generated for the sake of example!
# eel_range_tip_data <- stats::setNames(eel.data$feed_mode, rownames(eel.data))
# eel_range_tip_data <- as.character(eel_range_tip_data)
# eel_range_tip_data[eel_range_tip_data == "bite"] <- "A"
# eel_range_tip_data[eel_range_tip_data == "suction"] <- "B"
# eel_range_tip_data[c(5, 6, 7, 15, 25, 32, 33, 34, 50, 52, 57, 58, 59)] <- "AB"
# eel_range_tip_data <- stats::setNames(eel_range_tip_data, rownames(eel.data))
# table(eel_range_tip_data)
# 
# # Here, the input date is a vector of character strings with tip.label as names, and range data as values.
# # Range coding scheme must follow the coding scheme used in BioGeoBEARS:
#   # Unique areas must be in CAPITAL letters (e.g., "A", "B")
#   # Ranges encompassing multiple areas are formed in combining unique area letters
#   # in alphabetic order (e.g., "AB", "BC", "ABC")
# eel_range_tip_data
# 
# ## Convert into binary presence/absence matrix
# 
# # deepSTRAPP also accept biogeographic data as binary presence/absence matrix or data.frame
# # Here is the equivalent biogeographic data converted into a valid binary presence/absence matrix
# 
# # Extract ranges
# all_ranges <- levels(as.factor(eel_range_tip_data))
# # Order in number of areas x alphabetic order (i.e., single areas, then 2-area ranges, etc.)
# unique_areas <- all_ranges[nchar(all_ranges) == 1]
# unique_areas <- unique_areas[order(unique_areas)]
# multi_area_ranges <- setdiff(all_ranges, unique_areas)
# multi_area_ranges <- multi_area_ranges[order(multi_area_ranges)]
# all_ranges_ordered <- c(unique_areas, multi_area_ranges)
# # Create template matrix only with unique areas
# eel_range_binary_matrix <- matrix(data = 0,
#                                   nrow = length(eel_range_tip_data),
#                                   ncol = length(unique_areas),
#                                   dimnames = list(names(eel_range_tip_data), unique_areas))
# # Fill with presence/absence data
# for (i in seq_along(eel_range_tip_data))
# {
#   # i <- 1
# 
#   # Extract range for taxa i
#   range_i <- eel_range_tip_data[i]
#   # Decompose range in unique areas
#   all_unique_areas_in_range_i <- unlist(strsplit(x = range_i, split = ""))
#   # Record match in eel_range_tip_data vector
#   eel_range_binary_matrix[i, all_unique_areas_in_range_i] <- 1
# }
# eel_range_binary_matrix
# 
# # Rows are taxa. Columns are unique areas. Values are presence/absence recorded in each area with '0/1'.
# # Taxa with multi-area ranges (i.e., encompassing multiple unique areas)
# # have multiple '1' in the same row.
# 
# 
# ## Check that trait tip data and phylogeny are named and ordered similarly
# all(names(eel_range_tip_data) == eel.tree$tip.label)
# all(row.names(eel_range_binary_matrix) == eel.tree$tip.label)
# 
# # Reorder tip_data as in phylogeny
# eel_range_tip_data <- eel_range_tip_data[eel.tree$tip.label]
# # Reorder eel_range_binary_matrix as in phylogeny
# eel_range_binary_matrix <- eel_range_binary_matrix[eel.tree$tip.label, ]
# 
# ## Set colors per ranges
# colors_per_ranges <- c("dodgerblue3", "gold")
# names(colors_per_ranges) <- c("A", "B")
# 
# # If you provide only colors for unique areas (e.g., "A", "B"), the colors of multi-area ranges
# # will be interpolated (e.g., "AB")
# # In this example, using types of blue and yellow for range "A" and "B" respectively
# # will yield a type of green for multi-area range "AB".
# 
# 
# # ------ Step 1: Prepare biogeographic data ------ #
# 
# ## Goal: Map biogeographic range evolution on the time-calibrated phylogeny
# 
# # 1.1/ Fit biogeographic evolutionary models to trait data using Maximum Likelihood.
# # 1.2/ Select the best fitting model comparing AICc.
# # 1.3/ Infer ancestral characters estimates (ACE) of ranges at nodes.
# # 1.4/ Run Biogeographic Stochastic Mapping (BSM) simulations to generate biogeographic histories
# #      compatible with the best model and inferred ACE.
# # 1.5/ Infer ancestral ranges along branches.
# #  - Compute posterior frequencies of each range to produce a `densityMap` for each range.
# 
# library(deepSTRAPP)
# 
# # All these actions are performed by a single function: deepSTRAPP::prepare_trait_data()
# ?deepSTRAPP::prepare_trait_data()
# # Model selection is performed internally with deepSTRAPP::select_best_model_from_geiger()
# ?deepSTRAPP::select_best_model_from_geiger()
# # Plotting of all densityMaps as a unique phylogeny is carried out
# # with deepSTRAPP::plot_densityMaps_overlay()
# ?deepSTRAPP::plot_densityMaps_overlay()
# 
# ## The R package `BioGeoBEARS` is needed for this workflow to process biogeographic data.
# ## Please install it manually from: https://github.com/nmatzke/BioGeoBEARS.
# 
# ## Macroevolutionary models for biogeographic data
# 
# # For more in-depth information on the models available see the R package [BioGeoBEARS]
# # See the associated website: http://phylo.wikidot.com/biogeobears
# 
# ## Parameters in BioGeoBEARS
# 
# # Biogeographic models are based on a set of biogeographic events defined by a set of parameters
# 
#   ## Anagenetic events = occurring along branches
#   # d = dispersal rate = anagenetic range extension. Ex: A -> AB
#   # e = extinction rate = anagenetic range contraction. Ex: AB -> A
# 
#   ## Cladogenetic events = occurring at speciation
#   # y = Non-transitional speciation relative weight = cladogenetic inheritance.
#     # Ex: Narrow inheritance: A -> (A),(A); Wide-spread sympatric speciation: AB -> (AB),(AB)
#   # v = Vicariance relative weight = cladogenetic vicariance.
#     # Ex: Narrow vicariance: AB -> (A),(B); Wide-vicariance: ABCD -> (AB),(CD)
#   # s = Subset speciation relative weight = cladogenetic sympatric speciation.
#     # Ex: AB -> (AB),(A)
#   # j = Jump-dispersal relative weight = cladogenetic founder-event.
#     # Ex: A -> (A),(B)
# 
# ## 6 models from BioGeoBEARS are available
# 
#  # "BAYAREALIKE": BAYAREALIKE is a likelihood interpretation of the Bayesian implementation
#  # of BAYAREA model, and it is "like BAYAREA".
#     # It is the "simpler" model, that allows the least types of biogeographic events to occur.
#     # Nothing is happening during cladogenesis. Only inheritance of previous range (y = 1).
#       # Allows narrow and wide-spread sympatric speciation: A -> (A),(A); AB => (AB),(AB)
#     # No narrow or wide vicariance (v = 0)
#     # No subset sympatric speciation (s = 0)
#     # No jump dispersal (j = 0)
#     # Model has 2 free parameters = d (range extension), e (range contraction).
# 
#  # "DIVALIKE": DIVALIKE is a likelihood interpretation of parsimony DIVA, and it is "like DIVA".
#     # Compared to BAYAREALIKE, it allows vicariance events to occur during speciation.
#     # Allows narrow sympatric speciation (y): A -> (A),(A)
#       # Does NOT allow wide-spread sympatric speciation.
#     # Allows and narrow AND wide-spread vicariance (v): AB -> (A),(B); ABCD -> (AB),(CD)
#     # No subset sympatric speciation (s = 0)
#     # No jump dispersal (j = 0)
#     # Relative weights of y and v are fixed to 1.
#     # Model has 2 free parameters = d (range extension), e (range contraction).
# 
#  # "DEC": Dispersal-Extinction-Cladogenesis model. This is the default model in deepSTRAPP.
#     # Compared to BAYAREALIKE, it allows subset speciation (s) to occur,
#     # but it does not allows wide-spread vicariance.
#     # Allows narrow sympatric speciation (y): A -> (A),(A)
#       # Does NOT allow wide-spread sympatric speciation.
#     # Allows narrow vicariance (v): AB -> (A),(B)
#       # Does NOT allow wide-spread vicariance.
#     # Allows subset sympatric speciation: AB -> (AB),(A)
#     # No jump dispersal (j = 0)
#     # Relative weights of y, v, and s are fixed to 1.
#     # Model has 2 free parameters = d (range extension), e (range contraction).
# 
#  # "...+J": All previous models can add jump-dispersal events with the parameter j.
#     # Allows jump-dispersal events to occur at speciation: A -> (A),(B)
#       # Depicts cladogenetic founder events where a small population disperse to a new area.
#       # Isolation results in speciation of the two populations in distinct lineages
#       # occurring in two different areas.
#     # Relative weights of y,v,s are fixed to 1-j ("BAYAREALIKE+J"), 2-j ("DIVALIKE+J"), or 3-j ("DEC+J").
#     # Models have only 3 free parameters = d (range extension), e (range contraction),
#     # and j (jump dispersal).
# 
# ## Model trait data evolution
# eel_biogeo_data <- prepare_trait_data(
#      tip_data = eel_range_tip_data,
#      # Alternative input using the binary presence/absence range matrix
#      # tip_data = eel_range_binary_matrix,
#      trait_data_type = "biogeographic",
#      phylo = eel.tree,
#      seed = 1234, # Set seed for reproducibility
#      # All models available
#      evolutionary_models = c("BAYAREALIKE", "DIVALIKE", "DEC", "BAYAREALIKE+J", "DIVALIKE+J", "DEC+J"),
#      # To provide link to the directory folder where the outputs of BioGeoBEARS will be saved
#      BioGeoBEARS_directory_path = "./BioGeoBEARS_directory/",
#      # Whether to save BioGeoBEARS intermediate files, or remove them after the run
#      keep_BioGeoBEARS_files = TRUE,
#      prefix_for_files = "eel", # Prefixe used to save BioGeoBEARS intermediate files
#      # To set the number of core to use for computation.
#      # Parallelization over multiple cores may speed up the process.
#      nb_cores = 1,
#      # To define the maximum number of unique areas in ranges. Ex: "AB" = 2; "ABC" = 3.
#      max_range_size = 2,
#      split_multi_area_ranges = TRUE, # Set to TRUE to display the two outputs
#      res = 100, # To set the resolution of the continuous mapping of ranges on the densityMaps
#      colors_per_levels = colors_per_ranges,
#      # Reduce the number of Stochastic Mapping simulations to save time (Default = '1000')
#      nb_simulations = 100,
#      # Plot the densityMaps with plot_densityMaps_overlay() to show all ranges at once.
#      plot_overlay = TRUE,
#      # PDF_file_path = "./eel_biogeo_evolution_mapped_on_phylo.pdf",
#      # To include Ancestral Character Estimates (ACE) of ranges at nodes in the output
#      return_ace = TRUE,
#      # To include the lists of cladogenetic and anagenetic events produced with BioGeoBEARS::runBSM()
#      return_BSM = FALSE,
#      # To include the Biogeographic Stochastic Mapping simulations (simmaps) in the output
#      return_simmaps = TRUE,
#      # To include the best model fit in the output
#      return_best_model_fit = TRUE,
#      # To include the df for model selection in the output
#      return_model_selection_df = TRUE,
#      verbose = TRUE) # To display progress
# 
# # ------ Step 2: Explore output ------ #
# 
# ## Explore output
# str(eel_biogeo_data, 1)
# 
# 
# ## Extract the densityMaps showing posterior probabilities of ranges on the phylogeny
# ## as estimated from the model
# eel_densityMaps <- eel_biogeo_data$densityMaps
# 
# # Because we selected 'split_multi_area_ranges = TRUE',
# # those densityMaps only record posterior probability of presence in the two unique areas "A" and "B".
# # Probability of presences in multi-area range "AB" have been equally split between "A" and "B".
# # This is useful when you wish to test for differences in rates
# # between unique areas only (e.g., "A" and "B").
# 
# # densityMaps including all ranges are also available in the output
# eel_densityMaps_all_ranges <- eel_biogeo_data$densityMaps_all_ranges
# # If you wish to test for differences across all types of ranges (e.g., "A", "B", and "AB"),
# # you need to use these densityMaps, or set 'split_multi_area_ranges = FALSE'
# # such as no split of multi-area ranges is performed, and the densityMaps contains all ranges by default.
# 
# ## Plot densityMap for each range
# # Grey represents absence of the range. Color represents presence of the range.
# plot(eel_densityMaps_all_ranges[[1]]) # densityMap for range n°1 ("A")
# plot(eel_densityMaps_all_ranges[[2]]) # densityMap for range n°2 ("B")
# plot(eel_densityMaps_all_ranges[[3]]) # densityMap for range n°3 ("AB")
# 
# ## Plot all densityMaps overlaid in on a single phylogeny
# # Each color highlights presence of its associated range.
# 
# # Plot densityMaps considering only unique areas
# plot_densityMaps_overlay(eel_densityMaps)
# # Plot densityMaps with all ranges, including multi-area ranges
# plot_densityMaps_overlay(eel_densityMaps_all_ranges)
# 
# # The densityMaps are the main input needed to perform a deepSTRAPP run on categorical trait data.
# 
# 
# ## Extract the Ancestral Character Estimates (ACE) = trait values at nodes
# 
# # Only with unique areas
# eel_ACE <- eel_biogeo_data$ace
# head(eel_ACE)
# 
# # Including multi-area ranges (Here, "AB")
# eel_ACE_all_ranges <- eel_biogeo_data$ace_all_ranges
# head(eel_ACE_all_ranges)
# 
# # This is a matrix of numerical values with row.names = internal node ID,
# # colnames = ancestral ranges and values = posterior probability.
# # It can be used as an optional input in deepSTRAPP run to provide
# # perfectly accurate estimates for ancestral ranges at internal nodes.
# 
# 
# ## Explore summary of model selection
# eel_biogeo_data$model_selection_df # Summary of model selection
# # Models are compared using the corrected Akaike's Information Criterion (AICc)
# # Akaike's weights represent the probability that a given model is the best
# # among the set of candidate models, given the data.
# # Here, the best model is the "DEC+J" model
# 
# 
# ## Explore best model fit (DEC+J model)
# # Summary of best model optimization by BioGeoBEARS::bears_optim_run()
# str(eel_biogeo_data$best_model_fit, 1)
# # Parameter estimates and likelihood optimization information
# eel_biogeo_data$best_model_fit$optim_result
#   # p1 = d = dispersal rate = anagenetic range extension. Ex: A -> AB
#   # p2 = e = extinction rate = anagenetic range contraction. Ex: AB -> A
#   # p3 = j = jump-dispersal relative weight = cladogenetic founder-event. Ex: A -> (A),(B)
# 
# 
# ## Explore Biogeographic Stochastic Mapping outputs from BioGeoBEARS::runBSM()
# # Since we selected 'return_BSM = TRUE', lists of cladogenetic and anagenetic events produced
# # with BioGeoBEARS::runBSM() are included in the output
# ?BioGeoBEARS::runBSM()
# 
# # This element contains two lists of data.frames summarizing cladogenetic
# # and anagenetic events occurring all BSM simulations.
# # Each simulation yields a data.frame for each list.
# str(eel_biogeo_data$BSM_output, 1)
# 
# # Example: Cladogenetic event summary for simulation n°1
# head(eel_biogeo_data$BSM_output$RES_clado_events_tables[[1]])
# # Example: Anagenetic event summary for simulation n°1
# head(eel_biogeo_data$BSM_output$RES_ana_events_tables[[1]])
# 
# 
# ## Explore simmaps
# # Since we selected 'return_simmaps = TRUE',
# # Stochastic Mapping simulations (simmaps) are included in the output
# # Each simmap represents a simulated evolutionary history with final ranges
# # compatible with the tip_data and estimated ACE at nodes.
# # Each simmap also follows the transition parameters of the best fit model
# # to simulate transitions along branches.
# 
# # Update color_per_ranges to include the color interpolated for range "AB"
# AB_color_gradient <- eel_densityMaps_all_ranges$Density_map_AB$cols
# AB_color <- AB_color_gradient[length(AB_color_gradient)]
# 
# colors_per_all_ranges <- c(colors_per_ranges, AB_color)
# names(colors_per_all_ranges) <- c("A", "B", "AB")
# 
# # Plot simmap n°1 using the same color scheme as in densityMaps
# plot(eel_biogeo_data$simmaps[[1]], colors = colors_per_all_ranges, fsize = 0.7)
# # Plot simmap n°10 using the same color scheme as in densityMaps
# plot(eel_biogeo_data$simmaps[[10]], colors = colors_per_all_ranges, fsize = 0.7)
# # Plot simmap n°100 using the same color scheme as in densityMaps
# plot(eel_biogeo_data$simmaps[[100]], colors = colors_per_all_ranges, fsize = 0.7)
# 
# 
# ## Inputs needed to run deepSTRAPP are the densityMaps (eel_densityMaps),
# ## and optionally, the tip_data (eel_tip_data), and the ACE (eel_ACE)
# 
# 

## ----model_trait_evolution_biogeo_eval, fig.height = 7, eval = TRUE, echo = FALSE, out.width = "100%"----
# Load directly output
data(eel_biogeo_data, package = "deepSTRAPP")

## Plot densityMaps

# Plot densityMaps considering only unique areas
plot_densityMaps_overlay(eel_biogeo_data$densityMaps, fsize = 0.6)
title(main = "Ancestral Ranges - Unique areas")
# Plot densityMaps with all ranges, including multi-area ranges
plot_densityMaps_overlay(eel_biogeo_data$densityMaps_all_ranges, fsize = 0.6)
title(main = "Ancestral Ranges - All ranges")
## Explore summary of model selection
eel_biogeo_data$model_selection_df # Summary of model selection

## Explore simmaps

colors_per_all_ranges <- c("dodgerblue3", "gold", "#0DE632")
names(colors_per_all_ranges) <- c("A", "B", "AB")

plot(eel_biogeo_data$simmaps[[1]], colors = colors_per_all_ranges, fsize = 0.7)
title(main = "\nStochastic Mapping simulation n°1")

plot(eel_biogeo_data$simmaps[[10]], colors = colors_per_all_ranges, fsize = 0.7)
title(main = "\nStochastic Mapping simulation n°10")


