% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/apply-pgs.R
\name{apply.polygenic.score}
\alias{apply.polygenic.score}
\title{Apply polygenic score to VCF data}
\usage{
apply.polygenic.score(
  vcf.data,
  pgs.weight.data,
  phenotype.data = NULL,
  phenotype.analysis.columns = NULL,
  correct.strand.flips = TRUE,
  remove.ambiguous.allele.matches = FALSE,
  remove.mismatched.indels = FALSE,
  output.dir = NULL,
  file.prefix = NULL,
  missing.genotype.method = "mean.dosage",
  use.external.effect.allele.frequency = FALSE,
  n.percentiles = NULL,
  analysis.source.pgs = NULL,
  validate.inputs.only = FALSE
)
}
\arguments{
\item{vcf.data}{A data.frame containing VCF genotype data as formatted by \code{import.vcf()}.}

\item{pgs.weight.data}{A data.frame containing PGS weight data as formatted by \code{import.pgs.weight.file()}.}

\item{phenotype.data}{A data.frame containing phenotype data. Must have an Indiv column matching vcf.data. Default is \code{NULL}.}

\item{phenotype.analysis.columns}{A character vector of phenotype columns from phenotype.data to analyze in a regression analysis. Default is \code{NULL}.
Phenotype variables are automatically classified as continuous, binary, or neither based on data type and number of unique values. The calculated PGS is associated
with each phenotype variable using linear or logistic regression for continuous or binary phenotypes, respectively. See \code{run.pgs.regression} for more details.
If no phenotype.analysis.columns are provided, no regression analysis is performed.}

\item{correct.strand.flips}{A logical indicating whether to check PGS weight data/VCF genotype data matches for strand flips and correct them. Default is \code{TRUE}.
The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.}

\item{remove.ambiguous.allele.matches}{A logical indicating whether to remove PGS variants with ambiguous allele matches between PGS weight data and VCF genotype data. Default is \code{FALSE}.
The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.}

\item{remove.mismatched.indels}{A logical indicating whether to remove indel variants that are mismatched between PGS weight data and VCF genotype data. Default is \code{FALSE}.
The PGS catalog standard column \code{other_allele} in \code{pgs.weight.data} is required for this check.}

\item{output.dir}{A character string indicating the directory to write output files. Separate files are written for per-sample PGS results and optional regression results.
Files are tab-separated .txt files. Default is \code{NULL}, in which case no files are written.}

\item{file.prefix}{A character string to prepend to the output file names. Default is \code{NULL}.}

\item{missing.genotype.method}{A character string indicating the method to handle missing genotypes. Options are "mean.dosage", "normalize", or "none". Default is "mean.dosage".}

\item{use.external.effect.allele.frequency}{A logical indicating whether to use an external effect allele frequency for calculating mean dosage when handling missing genotypes. Default is \code{FALSE}.
Provide allele frequency as a column in \code{pgs.weight.data} named \code{allelefrequency_effect}.}

\item{n.percentiles}{An integer indicating the number of percentiles to calculate for the PGS. Default is \code{NULL}.}

\item{analysis.source.pgs}{A character string indicating the source PGS for percentile calculation and regression analyses. Options are "mean.dosage", "normalize", or "none".
When not specified, defaults to \code{missing.genotype.method} choice and if more than one PGS missing genotype method is chosen, calculation defaults to the first selection.}

\item{validate.inputs.only}{A logical indicating whether to only perform input data validation checks without running PGS application.
If no errors are triggered, a message is printed and TRUE is returned. Default is \code{FALSE}.}
}
\value{
A list containing per-sample PGS output and per-phenotype regression output if phenotype analysis columns are provided.

\strong{Output Structure}

The output list contains the following elements:
\itemize{
\item pgs.output: A data.frame containing the PGS per sample and optional phenotype data.
\item regression.output: A data.frame containing the results of the regression analysis if phenotype.analysis.columns are provided, otherwise \code{NULL}.
}

pgs.output columns:
\itemize{
\item \code{Indiv}: A character string indicating the sample ID.
\item \code{PGS}: A numeric vector indicating the PGS per sample. (only if missing.genotype.method includes "none")
\item \code{PGS.with.normalized.missing}: A numeric vector indicating the PGS per sample with missing genotypes normalized. (only if missing.genotype.method includes "normalize")
\item \code{PGS.with.replaced.missing}: A numeric vector indicating the PGS per sample with missing genotypes replaced by mean dosage. (only if missing.genotype.method includes "mean.dosage")
\item \code{percentile}: A numeric vector indicating the percentile rank of the PGS.
\item \code{decile}: A numeric vector indicating the decile rank of the PGS.
\item \code{quartile}: A numeric vector indicating the quartile rank of the PGS.
\item \code{percentile.X}: A numeric vector indicating the user-specified percentile rank of the PGS where "X" is substituted by \code{n.percentiles}. (only if \code{n.percentiles} is specified)
\item \code{n.missing.genotypes}: A numeric vector indicating the number of missing genotypes per sample.
\item \code{percent.missing.genotypes}: A numeric vector indicating the percentage of missing genotypes per sample.
\item All columns in \code{phenotype.data} if provided.
}

regression.output columns:
\itemize{
\item \code{phenotype}: A character vector of phenotype names.
\item \code{model}: A character vector indicating the regression model used. One of "logistic.regression" or "linear.regression".
\item \code{beta}: A numeric vector indicating the beta coefficient of the regression analysis.
\item \code{se}: A numeric vector indicating the standard error of the beta coefficient.
\item \code{p.value}: A numeric vector indicating the p-value of the beta coefficient.
\item \code{r.squared}: A numeric vector indicating the r-squared value of linear regression analysis. NA for logistic regression.
\item \code{AUC}: A numeric vector indicating the area under the curve of logistic regression analysis. NA for linear regression.
}

\strong{PGS Calculation}

PGS for each individual \emph{i} is calculated as the sum of the product of the dosage and beta coefficient for each variant in the PGS:
\deqn{PGS_i = \sum_{m=1}^{M} \left( \beta_m \times dosage_{im} \right)}
Where \emph{m} is a PGS component variant out of a total \emph{M} variants.

\strong{Missing Genotype Handling}

VCF genotype data are matched to PGS data by chromosome and position. If a SNP cannot be matched by genomic coordinate,
an attempt is made to match by rsID (if available). If a SNP from the PGS weight data is not found in the VCF data after these two matching attempts,
it is considered a cohort-wide missing variant.

Missing genotypes (in individual samples) among successfully matched variants are handled by three methods:

\code{none}: Missing genotype dosages are excluded from the PGS calculation.
This is equivalent to assuming that all missing genotypes are homozygous for the non-effect allele, resulting in a dosage of 0.

\code{normalize}: Missing genotypes are excluded from score calculation but the final score is normalized by the number of non-missing alleles.
The calculation assumes a diploid genome:
\deqn{PGS_i = \dfrac{\sum \left( \beta_m \times dosage_{im} \right)}{P_i \times M_{non-missing}}}
Where \emph{P} is the ploidy and has the value \code{2} and \eqn{M_{non-missing}} is the number of non-missing genotypes.

\code{mean.dosage}: Missing genotype dosages are replaced by the mean population dosage of the variant which is calculated as the product of the effect allele frequency \emph{EAF} and the ploidy of a diploid genome:
\deqn{\overline{dosage_{k}} = EAF_k \times P}
where \emph{k} is a PGS component variant that is missing in between 1 and n-1 individuals in the cohort and \emph{P} = ploidy = 2.
This dosage calculation holds under assumptions of Hardy-Weinberg equilibrium.
By default, the effect allele frequency is calculated from the provided VCF data.
For variants that are missing in all individuals (cohort-wide), dosage is assumed to be zero (homozygous for the non-effect allele) for all individuals.
An external allele frequency can be provided in the \code{pgs.weight.data} as a column named \code{allelefrequency_effect} and by setting \code{use.external.effect.allele.frequency} to \code{TRUE}.

\strong{Multiallelic Site Handling}

If a PGS weight file provides weights for multiple effect alleles, the appropriate dosage is calculated for the alleles that each individual carries.
It is assumed that multiallelic variants are encoded in the same row in the VCF data. This is known as "merged" format. Split multiallelic sites are not accepted.
VCF data can be formatted to merged format using external tools for VCF file manipulation.

\strong{Allele Mismatch Handling}

Variants from the PGS weight data are merged with records in the VCF data by genomic coordinate.
After the merge is complete, there may be cases where the VCF reference (REF) and alternative (ALT) alleles do not match their conventional counterparts in the
PGS weight data (other allele and effect allele, respectively).
This is usually caused by a strand flip: the variant in question was called against opposite DNA reference strands in the PGS training data and the VCF data.
Strand flips can be detected and corrected by flipping the affected allele to its reverse complement.
\code{apply.polygenic.score} uses \code{assess.pgs.vcf.allele.match} to assess allele concordance, and is controlled through the following arguments:

\itemize{
\item \code{correct.strand.flips}: When \code{TRUE}, detected strand flips are corrected by flipping the affected value in the \code{effect_allele} column prior to dosage calling.
\item \code{remove.ambiguous.allele.matches}: Corresponds to the \code{return.ambiguous.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, non-INDEL allele
mismatches that cannot be resolved (due to palindromic alleles or causes other than strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing
prior to dosage calling and missing genotype handling. The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
\item \code{remove.mismatched.indels}: Corresponds to the \code{return.indels.as.missing} argument in \code{assess.pgs.vcf.allele.match}. When \code{TRUE}, INDEL allele mismatches
(which cannot be assessed for strand flips) are removed by marking the affected value in the \code{effect_allele} column as missing prior to dosage calling and missing genotype handling.
The corresponding dosage is set to NA and the variant is handled according to the chosen missing genotype method.
}

Note that an allele match assessment requires the presence of both the \code{other_allele} and \code{effect_allele} columns in the PGS weight data.
The \code{other_allele} column is not required by the PGS Catalog, and so is not always available.
}
\description{
Apply a polygenic score to VCF data.
}
\examples{
# Import the example VCF bundled with the package
vcf.path <- system.file(
    'extdata',
    'HG001_GIAB.vcf.gz',
    package = 'ApplyPolygenicScore',
    mustWork = TRUE
    );
vcf.import <- import.vcf(vcf.path);

# Import the example PGS weight file bundled with the package
pgs.weight.path <- system.file(
    'extdata',
    'PGS000662_hmPOS_GRCh38.txt.gz',
    package = 'ApplyPolygenicScore',
    mustWork = TRUE
    );
pgs.import <- import.pgs.weight.file(pgs.weight.path);

# Basic application: missing genotypes are excluded from the score
pgs.data <- apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    missing.genotype.method = 'none'
    );

# Request several missing genotype methods in a single call, taking the
# effect allele frequency from the weight data (placeholder value of 0.5)
n.pgs.variants <- nrow(pgs.import$pgs.weight.data);
pgs.import$pgs.weight.data$allelefrequency_effect <- rep(0.5, n.pgs.variants);
pgs.data <- apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    missing.genotype.method = c('none', 'mean.dosage', 'normalize'),
    use.external.effect.allele.frequency = TRUE
    );

# Control how allele mismatches between the PGS weights and the VCF are handled
pgs.data <- apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    correct.strand.flips = TRUE,
    remove.ambiguous.allele.matches = TRUE,
    remove.mismatched.indels = FALSE
    );

# Attach phenotype data to the per-sample output.
# No phenotype.analysis.columns are given here, so no regression is performed;
# supply phenotype.analysis.columns to also run the regression analysis.
sample.ids <- unique(vcf.import$dat$Indiv);
phenotype.data <- data.frame(
    Indiv = sample.ids,
    continuous.phenotype = rnorm(length(sample.ids)),
    binary.phenotype = sample(
        c('a', 'b'),
        length(sample.ids),
        replace = TRUE
        )
    );

pgs.data <- apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    phenotype.data = phenotype.data
    );

# Validate the inputs only, without applying the PGS
apply.polygenic.score(
    vcf.data = vcf.import$dat,
    pgs.weight.data = pgs.import$pgs.weight.data,
    validate.inputs.only = TRUE
    );
}
