diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml new file mode 100644 index 0000000..ade52e5 --- /dev/null +++ b/.github/workflows/deploy.yaml @@ -0,0 +1,49 @@ +name: Deploy CellRegMap + +on: + push: + tags: + - v* + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Get tag name + run: | + echo ${GITHUB_REF##*/} + + # clone the repo to the local directory + - uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: "3.8" + + # In the future, run the tests here + # - name: Test with pytest + # run: | + # python -m unittest test/ + + # Deploy to pypi + - name: Build and publish to pypi + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + pip install wheel twine + python setup.py sdist bdist_wheel + twine upload dist/* + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + # Build and deploy the docker + - name: Build and publish to docker + run: | + DOCKER_TAG=annasecuomo/cellregmap:${GITHUB_REF##*/} + docker build -t $DOCKER_TAG . 
+ docker push $DOCKER_TAG diff --git a/.gitignore b/.gitignore index 38ce427..ffbcc37 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +docs/_autosummary + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7eba10b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,5 @@ +FROM python:3.8.5 + +ADD cellregmap /app/cellregmap/cellregmap +ADD setup.py setup.cfg version.py proof.md LICENSE MANIFEST.in /app/cellregmap/ +RUN pip install /app/cellregmap diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..97fb9fb --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Anna Cuomo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100755 index 0000000..762aa52 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include LICENSE +include README.md +include version.py diff --git a/README.md b/README.md index e69de29..1bf02e0 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,47 @@ +# CellRegMap + +Cellular Regulatory Map (CellRegMap) is a linear mixed model approach to perform multi-context eQTL mapping by leveraging single cell RNA sequencing (scRNA-seq) data. +It is related to the previously proposed [StructLMM](https://www.nature.com/articles/s41588-018-0271-0) but importantly it can account for sample structure, including population structure and repeated observations for the same samples, e.g., multiple cells for the same donor. + +The CellRegMap model and its applications to both real and simulated data are described in the [CellRegMap manuscript](https://www.embopress.org/doi/full/10.15252/msb.202110663). + +For more instructions and tutorials to facilitate usage of the package visit [our website](https://limix.github.io/CellRegMap/)! + +## Install + +To install the latest release from PyPI, enter + + python3 -m pip install cellregmap + +in your command line. + +## Development + +To install it in development mode, enter + + git clone https://github.com/limix/CellRegMap.git + cd CellRegMap + python3 -m pip install -e . + +in your command line. + +## Installation using a Docker image +If you use Docker, you can also pull the [pre-built image from dockerhub](https://hub.docker.com/r/annasecuomo/cellregmap). 
+ + + diff --git a/cellregmap/__init__.py b/cellregmap/__init__.py new file mode 100644 index 0000000..e8a5609 --- /dev/null +++ b/cellregmap/__init__.py @@ -0,0 +1,20 @@ +from ._cellregmap import CellRegMap, run_association, run_interaction, estimate_betas, run_association_fast +# from ._simulate import create_variances, sample_phenotype, sample_phenotype_gxe +from ._types import Term + +__version__ = "0.0.3" + +# these will be imported when adding ``from cellregmap import *`` +__all__ = [ + "__version__", + "CellRegMap", + "run_association", + "run_association_fast", + "run_interaction", + "estimate_betas", + # "lrt_pvalues", + # "sample_phenotype", + # "create_variances", + # "sample_phenotype_gxe", + "Term", +] diff --git a/cellregmap/_cellregmap.py b/cellregmap/_cellregmap.py new file mode 100644 index 0000000..2a81bd4 --- /dev/null +++ b/cellregmap/_cellregmap.py @@ -0,0 +1,682 @@ +from typing import Optional + +from glimix_core.lmm import LMM +from numpy import ( + asarray, + atleast_1d, + atleast_2d, + concatenate, + inf, + linspace, + ones, + sqrt, + stack, +) +from numpy.linalg import cholesky +from numpy_sugar import ddot +from numpy_sugar.linalg import economic_qs_linear, economic_svd +from tqdm import tqdm + +from ._math import PMat, QSCov, ScoreStatistic + + +class CellRegMap: + """ + Mixed-model with genetic effect heterogeneity. + + The CellRegMap model can be cast as: + + 𝐲 = W𝛂 + 𝐠𝛽₁ + 𝐠⊙𝛃₂ + 𝐞 + 𝐮 + 𝛆, (1) + + where: + + 𝛃₂ ~ 𝓝(𝟎, 𝓋₃𝙴₀𝙴₀ᵀ), + 𝐞 ~ 𝓝(𝟎, 𝓋₁ρ₁𝙴₁𝙴₁ᵀ), + 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ), and + 𝛆 ~ 𝓝(𝟎, 𝓋₂𝙸). + + 𝐠⊙𝛃 is a random effect term which models the GxC effect. + Additionally, W𝛂 models additive covariates and 𝐠𝛽₁ models persistent genetic effects. + Both are modelled as fixed effects. + On the other hand, 𝐞, 𝐮 and 𝛆 are modelled as random effects + 𝐞 is the environment effect, 𝐮 is a background term accounting for interactions between population structure + and environmental structure, and 𝛆 is the iid noise. 
+ The full covariance of 𝐲 is therefore given by: + + cov(𝐲) = 𝓋₃𝙳𝙴₀𝙴₀ᵀ𝙳 + 𝓋₁ρ₁𝙴₁𝙴₁ᵀ + 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ + 𝓋₂𝙸, + + where 𝙳 = diag(𝐠). Its marginalised form is given by: + + 𝐲 ~ 𝓝(W𝛂 + 𝐠𝛽₁, 𝓋₃𝙳𝙴₀𝙴₀ᵀ𝙳 + 𝓋₁(ρ₁𝙴₁𝙴₁ᵀ + (1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ) + 𝓋₂𝙸). + + The CellRegMap method is used to perform an interaction test: + + The interaction test compares the following hypotheses (from Eq. 1): + + 𝓗₀: 𝓋₃ = 0 + 𝓗₁: 𝓋₃ > 0 + + 𝓗₀ denotes no GxE effects, while 𝓗₁ models the presence of GxE effects. + + """ + + def __init__(self, y, E, W=None, Ls=None, E1=None, hK=None): + self._y = asarray(y, float).flatten() + self._E0 = asarray(E, float) + Ls = [] if Ls is None else Ls + + if W is not None: + self._W = asarray(W, float) + else: + self._W = ones((self._y.shape[0], 1)) + + if E1 is not None: + self._E1 = asarray(E1, float) + else: + self._E1 = asarray(E, float) + + + self._Ls = list(asarray(L, float) for L in Ls) + + assert self._W.ndim == 2 + assert self._E0.ndim == 2 + assert self._E1.ndim == 2 + + assert self._y.shape[0] == self._W.shape[0] + assert self._y.shape[0] == self._E0.shape[0] + assert self._y.shape[0] == self._E1.shape[0] + + for L in Ls: + assert self._y.shape[0] == L.shape[0] + assert L.ndim == 2 + + self._null_lmm_assoc = {} + + self._halfSigma = {} + self._Sigma_qs = {} + # TODO: remove it after debugging + self._Sigma = {} + + # option to set different background (when Ls are defined, background is K*EEt + EEt) + if len(Ls) == 0: + # self._rho0 = [1.0] + if hK is None: # EEt only as background + self._rho1 = [1.0] + self._halfSigma[1.0] = self._E1 + self._Sigma_qs[1.0] = economic_qs_linear(self._E1, return_q1=False) + else: # hK is decomposition of K, background in this case is K + EEt + self._rho1 = linspace(0, 1, 11) + for rho1 in self._rho1: + a = sqrt(rho1) + b = sqrt(1 - rho1) + hS = concatenate([a * self._E1] + [b * hK], axis=1) + self._halfSigma[rho1] = hS + self._Sigma_qs[rho1] = economic_qs_linear( + self._halfSigma[rho1], return_q1=False + ) + else: + # 
self._rho0 = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] + self._rho1 = linspace(0, 1, 11) + for rho1 in self._rho1: + # Σ = ρ₁𝙴𝙴ᵀ + (1-ρ₁)𝙺⊙E + # concatenate((sqrt(rho1) * self._E, sqrt(1 - rho1) * G1), axis=1) + # self._Sigma[rho1] = rho1 * self._EE + (1 - rho1) * self._K + # self._Sigma_qs[rho1] = economic_qs(self._Sigma[rho1]) + a = sqrt(rho1) + b = sqrt(1 - rho1) + hS = concatenate([a * self._E1] + [b * L for L in Ls], axis=1) + self._halfSigma[rho1] = hS + self._Sigma_qs[rho1] = economic_qs_linear( + self._halfSigma[rho1], return_q1=False + ) + + @property + def n_samples(self): + return self._y.shape[0] + + def predict_interaction(self, G, MAF): + """ + Estimate effect sizes for a given set of SNPs + """ + # breakpoint() + G = asarray(G, float) + E0 = self._E0 + W = self._W + n_snps = G.shape[1] + beta_g_s = [] + beta_gxe_s = [] + + p = asarray(atleast_1d(MAF), float) + normalization = 1 / sqrt(2 * p * (1 - p)) + + for i in range(n_snps): + g = G[:, [i]] + # mean(𝐲) = W𝛂 + 𝐠𝛽₁ + 𝙴𝝲 = 𝙼𝛃 + M = concatenate((W, g, E0), axis=1) + gE = g * E0 + best = {"lml": -inf, "rho1": 0} + hSigma_p = {} + Sigma_qs = {} + for rho1 in self._rho1: + # Σ[ρ₁] = ρ₁(𝐠⊙𝙴)(𝐠⊙𝙴)ᵀ + (1-ρ₁)𝙺⊙EEᵀ + a = sqrt(rho1) + b = sqrt(1 - rho1) + hSigma_p[rho1] = concatenate( + [a * gE] + [b * L for L in self._Ls], axis=1 + ) + # ( + # (a * gE, b * self._G), axis=1 + # ) + # cov(𝐲) = 𝓋₁Σ[ρ₁] + 𝓋₂𝙸 + # lmm = Kron2Sum(Y, [[1]], M, hSigma_p[rho1], restricted=True) + Sigma_qs[rho1] = economic_qs_linear( + hSigma_p[rho1], return_q1=False + ) + lmm = LMM(self._y, M, Sigma_qs[rho1], restricted=True) + lmm.fit(verbose=False) + + if lmm.lml() > best["lml"]: + best["lml"] = lmm.lml() + best["rho1"] = rho1 + best["lmm"] = lmm + + # breakpoint() + lmm = best["lmm"] + # beta_g = 𝛽₁ + beta_g = lmm.beta[W.shape[1]] + # yadj = 𝐲 - 𝙼𝛃 + yadj = (self._y - lmm.mean()).reshape(self._y.shape[0], 1) + rho1 = best["rho1"] + v1 = lmm.v0 + v2 = lmm.v1 + hSigma_p_qs = economic_qs_linear(hSigma_p[rho1], 
return_q1=False) + qscov = QSCov(hSigma_p_qs[0][0], hSigma_p_qs[1], v1, v2) + # v = cov(𝐲)⁻¹(𝐲 - 𝙼𝛃) + v = qscov.solve(yadj) + + sigma2_gxe = v1 * rho1 + beta_gxe = sigma2_gxe * E0 @ (gE.T @ v) * normalization[i] + # beta_star = (beta_g * normalization + beta_gxe) + + beta_g_s.append(beta_g) + beta_gxe_s.append(beta_gxe) + + + return (asarray(beta_g_s), stack(beta_gxe_s).T) + + def estimate_aggregate_environment(self, g): + g = atleast_2d(g).reshape((g.size, 1)) + E0 = self._E0 + gE = g * E0 + W = self._W + M = concatenate((W, g, E0), axis=1) + best = {"lml": -inf, "rho1": 0} + hSigma_p = {} + for rho1 in self._rho1: + # Σₚ = ρ₁(𝐠⊙𝙴)(𝐠⊙𝙴)ᵀ + (1-ρ₁)𝙺⊙E + a = sqrt(rho1) + b = sqrt(1 - rho1) + hSigma_p[rho1] = concatenate([a * gE] + [b * L for L in self._Ls], axis=1) + # cov(𝐲) = 𝓋₁Σₚ + 𝓋₂𝙸 + # lmm = Kron2Sum(Y, [[1]], M, hSigma_p[rho1], restricted=True) + QS = self._Sigma_qs[rho1] + lmm = LMM(self._y, M, QS, restricted=True) + lmm.fit(verbose=False) + + if lmm.lml() > best["lml"]: + best["lml"] = lmm.lml() + best["rho1"] = rho1 + best["lmm"] = lmm + + lmm = best["lmm"] + yadj = self._y - lmm.mean() + # rho1 = best["rho1"] + v1 = lmm.v0 + v2 = lmm.v1 + rho1 = best["rho1"] + sigma2_gxe = rho1 * v1 + hSigma_p_qs = economic_qs_linear(hSigma_p[rho1], return_q1=False) + qscov = QSCov(hSigma_p_qs[0][0], hSigma_p_qs[1], v1, v2) + # v = cov(𝐲)⁻¹yadj + v = qscov.solve(yadj) + beta_gxe = sigma2_gxe * gE.T @ v + + return E0 @ beta_gxe + + def scan_association(self, G): + info = {"rho1": [], "e2": [], "g2": [], "eps2": []} + + # NULL model + best = {"lml": -inf, "rho1": 0} + for rho1 in self._rho1: + QS = self._Sigma_qs[rho1] + # LRT for fixed effects requires ML rather than REML estimation + lmm = LMM(self._y, self._W, QS, restricted=False) + lmm.fit(verbose=False) + + if lmm.lml() > best["lml"]: + best["lml"] = lmm.lml() + best["rho1"] = rho1 + best["lmm"] = lmm + + null_lmm = best["lmm"] + info["rho1"].append(best["rho1"]) + info["e2"].append(null_lmm.v0 * best["rho1"]) + 
info["g2"].append(null_lmm.v0 * (1 - best["rho1"])) + info["eps2"].append(null_lmm.v1) + + n_snps = G.shape[1] + alt_lmls = [] + for i in tqdm(range(n_snps)): + g = G[:, [i]] + X = concatenate((self._W, g), axis=1) + QS = self._Sigma_qs[best["rho1"]] + alt_lmm = LMM(self._y, X, QS, restricted=False) + alt_lmm.fit(verbose=False) + alt_lmls.append(alt_lmm.lml()) + + pvalues = lrt_pvalues(null_lmm.lml(), alt_lmls, dof=1) + + info = {key: asarray(v, float) for key, v in info.items()} + return asarray(pvalues, float), info + + + def scan_association_fast(self, G): + info = {"rho1": [], "e2": [], "g2": [], "eps2": []} + + # NULL model + best = {"lml": -inf, "rho1": 0} + for rho1 in self._rho1: + QS = self._Sigma_qs[rho1] + # LRT for fixed effects requires ML rather than REML estimation + lmm = LMM(self._y, self._W, QS, restricted=False) + lmm.fit(verbose=False) + + if lmm.lml() > best["lml"]: + best["lml"] = lmm.lml() + best["rho1"] = rho1 + best["lmm"] = lmm + + null_lmm = best["lmm"] + info["rho1"].append(best["rho1"]) + info["e2"].append(null_lmm.v0 * best["rho1"]) + info["g2"].append(null_lmm.v0 * (1 - best["rho1"])) + info["eps2"].append(null_lmm.v1) + + # Alternative model + lmm = null_lmm + flmm = lmm.get_fast_scanner() + alt_lmls = flmm.fast_scan(G, verbose=False)['lml'] + + pvalues = lrt_pvalues(null_lmm.lml(), alt_lmls, dof=1) + + info = {key: asarray(v, float) for key, v in info.items()} + return asarray(pvalues, float), info + + + def scan_interaction( + self, G, idx_E: Optional[any] = None, idx_G: Optional[any] = None + ): + """ + 𝐲 = W𝛂 + 𝐠𝛽₁ + 𝐠⊙𝛃₂ + 𝐞 + 𝐮 + 𝛆 + [fixed=X] [H1] + + 𝛃₂ ~ 𝓝(𝟎, 𝓋₃𝙴₀𝙴₀ᵀ), + 𝐞 ~ 𝓝(𝟎, 𝓋₁ρ₁𝙴₁𝙴₁ᵀ), + 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ), and + 𝛆 ~ 𝓝(𝟎, 𝓋₂𝙸). 
+ + 𝓗₀: 𝓋₃ = 0 + 𝓗₁: 𝓋₃ > 0 + """ + # TODO: make sure G is nxp + from chiscore import davies_pvalue + + G = asarray(G, float) + n_snps = G.shape[1] + pvalues = [] + info = {"rho1": [], "e2": [], "g2": [], "eps2": []} + + for i in tqdm(range(n_snps)): + g = G[:, [i]] + X = concatenate((self._W, g), axis=1) + best = {"lml": -inf, "rho1": 0} + # Null model fitting: find best (𝛂, 𝛽₁, 𝓋₁, 𝓋₂, ρ₁) + for rho1 in self._rho1: + # QS = self._Sigma_qs[rho1] + # halfSigma = self._halfSigma[rho1] + # Σ = ρ₁𝙴𝙴ᵀ + (1-ρ₁)𝙺⊙E + # cov(y₀) = 𝓋₁Σ + 𝓋₂I + QS = self._Sigma_qs[rho1] + lmm = LMM(self._y, X, QS, restricted=True) + lmm.fit(verbose=False) + + if lmm.lml() > best["lml"]: + best["lml"] = lmm.lml() + best["rho1"] = rho1 + best["lmm"] = lmm + + lmm = best["lmm"] + # H1 via score test + # Let K₀ = e²𝙴𝙴ᵀ + g²𝙺⊙E + 𝜀²I + # e²=𝓋₁ρ₁ + # g²=𝓋₁(1-ρ₁) + # 𝜀²=𝓋₂ + # with optimal values 𝓋₁ and 𝓋₂ found above. + info["rho1"].append(best["rho1"]) + info["e2"].append(lmm.v0 * best["rho1"]) + info["g2"].append(lmm.v0 * (1 - best["rho1"])) + info["eps2"].append(lmm.v1) + # QS = economic_decomp( Σ(ρ₁) ) + Q0 = self._Sigma_qs[best["rho1"]][0][0] + S0 = self._Sigma_qs[best["rho1"]][1] + # e2 = best["lmm"].v0 * best["rho1"] + # g2 = best["lmm"].v0 * (1 - best["rho1"]) + # eps2 = best["lmm"].v1 + # EE = self._E @ self._E.T + # K = self._G @ self._G.T + # K0 = e2 * EE + g2 * K + eps2 * eye(K.shape[0]) + qscov = QSCov( + Q0, + S0, + lmm.v0, # 𝓋₁ + lmm.v1, # 𝓋₂ + ) + # start = time() + # qscov = QSCov(self._Sigma_qs[best["rho1"]], lmm.C0[0, 0], lmm.C1[0, 0]) + # print(f"Elapsed: {time() - start}") + # X = concatenate((self._E, g), axis=1) + # X = concatenate((self._W, g), axis=1) + + # Let P₀ = K₀⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. + P = PMat(qscov, X) + # P0 = inv(K0) - inv(K0) @ X @ inv(X.T @ inv(K0) @ X) @ X.T @ inv(K0) + + # P₀𝐲 = K₀⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. 
+ + # Useful for permutation + if idx_E is None: + E0 = self._E0 + else: + E0 = self._E0[idx_E, :] + + # The covariance matrix of H1 is K = K₀ + 𝓋₃diag(𝐠)⋅𝙴𝙴ᵀ⋅diag(𝐠) + # We have ∂K/∂𝓋₃ = diag(𝐠)⋅𝙴𝙴ᵀ⋅diag(𝐠) + # The score test statistics is given by + # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 + # start = time() + + # Useful for permutation + if idx_G is None: + gtest = g.ravel() + else: + gtest = g.ravel()[idx_G] + + ss = ScoreStatistic(P, qscov, ddot(gtest, E0)) + Q = ss.statistic(self._y) + # import numpy as np + + # deltaK = np.diag(gtest) @ EE @ np.diag(gtest) + # Q_ = 0.5 * self._y.T @ P0 @ deltaK @ P0 @ self._y + # print(f"Elapsed: {time() - start}") + # Q is the score statistic for our interaction test and follows a linear + # combination + # of chi-squared (df=1) distributions: + # Q ∼ ∑λχ², where λᵢ are the non-zero eigenvalues of ½√P₀⋅∂K⋅√P₀. + # Since eigenvals(𝙰𝙰ᵀ) = eigenvals(𝙰ᵀ𝙰) (TODO: find citation), + # we can compute ½(√∂K)P₀(√∂K) instead. + # start = time() + # import scipy as sp + # sqrtm = sp.linalg.sqrtm + # np.linalg.eigvalsh(0.5 * sqrtm(P0) @ deltaK @ sqrtm(P0)) + # np.linalg.eigvalsh(0.5 * sqrtm(deltaK) @ P0 @ sqrtm(deltaK)) + # TODO: compare with Liu approximation, maybe try a computational intensive + # method + pval, pinfo = davies_pvalue(Q, ss.matrix_for_dist_weights(), True) + pvalues.append(pval) + # print(f"Elapsed: {time() - start}") + + info = {key: asarray(v, float) for key, v in info.items()} + return asarray(pvalues, float), info + + +def lrt_pvalues(null_lml, alt_lmls, dof=1): + """ + Compute p-values from likelihood ratios. + + These are likelihood ratio test p-values. + + Parameters + ---------- + null_lml : float + Log of the marginal likelihood under the null hypothesis. + alt_lmls : array_like + Log of the marginal likelihoods under the alternative hypotheses. + dof : int + Degrees of freedom. + + Returns + ------- + pvalues : ndarray + P-values. 
+ """ + from numpy import clip + from numpy_sugar import epsilon + from scipy.stats import chi2 + + lrs = clip(-2 * null_lml + 2 * asarray(alt_lmls, float), epsilon.super_tiny, inf) + pv = chi2(df=dof).sf(lrs) + return clip(pv, epsilon.super_tiny, 1 - epsilon.tiny) + +def run_association(y, W, E, G, hK=None): + """ + Association test. + + Test for persistent genetic effects. + + Compute p-values using a likelihood ratio test. + + Parameters + ---------- + y : array + Phenotype + W : array + Fixed effect covariates + E : array + Cellular contexts + G : array + Genotypes (expanded) + hK : array + decompositon of kinship matrix (expanded) + + Returns + ------- + pvalues : ndarray + P-values. + """ + if hK is None: hK = None + crm = CellRegMap(y, W, E, hK=hK) + pv = crm.scan_association(G) + return pv + +def run_association_fast(y, W, E, G, hK=None): + """ + Association test. + + Test for persistent genetic effects. + + Compute p-values using a likelihood ratio test. + + Parameters + ---------- + y : array + Phenotype + W : array + Fixed effect covariates + E : array + Cellular contexts + G : array + Genotypes (expanded) + hK : array + decompositon of kinship matrix (expanded) + + Returns + ------- + pvalues : ndarray + P-values. + """ + if hK is None: hK = None + crm = CellRegMap(y, W, E, hK=hK) + pv = crm.scan_association_fast(G) + return pv + +def get_L_values(hK, E): + """ + As the definition of Ls is not particulatly intuitive, + function to extract list of L values given kinship K and + cellular environments E + """ + # get eigendecomposition of EEt + [U, S, _] = economic_svd(E) + us = U * S + + # get decomposition of K \odot EEt + Ls = [ddot(us[:,i], hK) for i in range(us.shape[1])] + return Ls + +def run_interaction(y, E, G, W=None, E1=None, E2=None, hK=None, idx_G=None): + """ + Interaction test. + + Test for cell-level genetic effects due to GxC interactions. + + Compute p-values using a score test. 
+ + Parameters + ---------- + y : array + Phenotype + E : array + Cellular contexts (GxC component) + G : array + Genotypes (expanded) + W : array + Fixed effect covariates + hK : array + decompositon of kinship matrix (expanded) + E1 : array + Cellular contexts (C component) + E2 : array + Cellular contexts (K*C component) + idx_G : array + Permuted genotype index + + Returns + ------- + pvalues : ndarray + P-values. + """ + if E1 is None: E1 = E + else: E1 = E1 + if E2 is None: E2 = E + else: E2 = E2 + if hK is None: Ls = None + else: Ls = get_L_values(hK, E2) + crm = CellRegMap(y=y, E=E, W=W, E1=E1, Ls=Ls) + pv = crm.scan_interaction(G, idx_G) + return pv + +def compute_maf(X): + r"""Compute minor allele frequencies. + It assumes that ``X`` encodes 0, 1, and 2 representing the number + of alleles (or dosage), or ``NaN`` to represent missing values. + Parameters + ---------- + X : array_like + Genotype matrix. + Returns + ------- + array_like + Minor allele frequencies. + Examples + -------- + .. 
doctest:: + >>> from numpy.random import RandomState + >>> from limix.qc import compute_maf + >>> + >>> random = RandomState(0) + >>> X = random.randint(0, 3, size=(100, 10)) + >>> + >>> print(compute_maf(X)) # doctest: +FLOAT_CMP + [0.49 0.49 0.445 0.495 0.5 0.45 0.48 0.48 0.47 0.435] + """ + import dask.array as da + import xarray as xr + from pandas import DataFrame + from numpy import isnan, logical_not, minimum, nansum + if isinstance(X, da.Array): + s0 = da.nansum(X, axis=0).compute() + denom = 2 * (X.shape[0] - da.isnan(X).sum(axis=0)).compute() + elif isinstance(X, DataFrame): + s0 = X.sum(axis=0, skipna=True) + denom = 2 * logical_not(X.isna()).sum(axis=0) + elif isinstance(X, xr.DataArray): + if "sample" in X.dims: + kwargs = {"dim": "sample"} + else: + kwargs = {"axis": 0} + s0 = X.sum(skipna=True, **kwargs) + denom = 2 * logical_not(isnan(X)).sum(**kwargs) + else: + s0 = nansum(X, axis=0) + denom = 2 * logical_not(isnan(X)).sum(axis=0) + s0 = s0 / denom + s1 = 1 - s0 + maf = minimum(s0, s1) + if hasattr(maf, "name"): + maf.name = "maf" + return maf + +def estimate_betas(y, W, E, G, maf=None, E1=None, E2=None, hK=None): + """ + Effect sizes estimator + + Estimates cell-level genetic effects due to GxC + as well as persistent genetic effects across all cells. + + Parameters + ---------- + y : array + Phenotype + W : array + Fixed effect covariates + E : array + Cellular contexts + G : array + Genotypes (expanded) + maf: array + Minor allele frequencies (MAFs) for the SNPs in G + hK : array + decompositon of kinship matrix (expanded) + E1 : array + Cellular contexts (C component) + E2 : array + Cellular contexts (K*C component) + + Returns + ------- + betas : ndarray + estimated effect sizes, both persistent and due to GxC. 
+ """ + if E1 is None: E1 = E + else: E1 = E1 + if E2 is None: E2 = E + else: E2 = E2 + if hK is None: Ls = None + else: Ls = get_L_values(hK, E2) + crm = CellRegMap(y=y, E=E, W=W, E1=E1, Ls=Ls) + if maf is None: + maf = compute_maf(G) + # print("MAFs: {}".format(maf)) + betas = crm.predict_interaction(G, maf) + return betas diff --git a/cellregmap/_math.py b/cellregmap/_math.py new file mode 100644 index 0000000..fc56572 --- /dev/null +++ b/cellregmap/_math.py @@ -0,0 +1,256 @@ +""" +Mathematical definitions +------------------------ + +This module defines some core mathematical concepts which CellRegMap depends on. +The implementations provided here are meant to help understand those concepts and to provide test cases. + +We assume the definition + + 𝐲 ∼ 𝓝(𝚆, 𝙺) + +throughout this module. + +References +---------- +.. [1] Lippert, C., Xiang, J., Horta, D., Widmer, C., Kadie, C., Heckerman, D., + & Listgarten, J. (2014). Greater power and computational efficiency for + kernel-based association testing of sets of genetic variants. Bioinformatics, + 30(22), 3206-3214. +.. [2] Liu, H., Tang, Y., & Zhang, H. H. (2009). A new chi-square approximation + to the distribution of non-negative definite quadratic forms in non-central + normal variables. Computational Statistics & Data Analysis, 53(4), 853-856. +.. [3] Lee, Seunggeun, Michael C. Wu, and Xihong Lin. "Optimal tests for rare + variant effects in sequencing association studies." Biostatistics 13.4 (2012): + 762-775. +""" +from numpy import finfo, logical_not, sqrt +from numpy.linalg import eigh, eigvalsh, inv, lstsq, solve, svd +from numpy_sugar import ddot +from scipy.linalg import sqrtm + + +def rsolve(a, b): + """ + Robust solver. + """ + return lstsq(a, b, rcond=None)[0] + + +class QSCov: + """ + Represents 𝑎𝙺 + 𝑏𝙸. + + 𝙺 matrix is defined by its eigen decomposition, `QS`. 
+ """ + + def __init__(self, Q0, S0, a=1.0, b=1.0): + self._Q0 = Q0 + self._S0 = S0 + self._a = a + self._b = b + + def dot(self, v): + left = self._a * self._Q0 @ ddot(self._S0, self._Q0.T @ v, left=True) + right = self._b * v + return left + right + + def solve(self, v): + # nrows = self._Q0.shape[0] + # rank = self._Q0.shape[1] + + # tmp = ones(nrows) + # tmp[:rank] += (self._a / self._b) * self._S0 + # R = 1 / tmp + + # R0 = R[:rank] + # R1 = R[rank:] # Always ones + + # tmp = (self._a / self._b) * self._S0 + R0 = 1 / (1 + ((self._a / self._b) * self._S0)) + # R0 = R[:rank] + Q0v = self._Q0.T @ v + return (self._Q0 @ ddot(R0, Q0v, left=True) + v - self._Q0 @ Q0v) / self._b + # left = self._Q0 @ ddot(R0, self._Q0.T @ v, left=True) + # right = self._Q1 @ ddot(R1, self._Q1.T @ v, left=True) + # return (left + right) / self._b + + +class PMat: + """ + Represents 𝙿 = 𝙺⁻¹ - 𝙺⁻¹𝚆(𝚆ᵀ𝙺⁻¹𝚆)⁻¹𝚆ᵀ𝙺⁻¹. + + The 𝙺 is defined via an `QSCov` object. + """ + + def __init__(self, qscov: QSCov, W): + self._qscov = qscov + self._W = W + self._KiW = self._qscov.solve(self._W) + + def dot(self, v): + Kiv = self._qscov.solve(v) + return Kiv - self._KiW @ rsolve(self._W.T @ self._KiW, self._KiW.T @ v) + + +def P_matrix(W, K): + """ Computes 𝙿 = 𝙺⁻¹ - 𝙺⁻¹𝚆(𝚆ᵀ𝙺⁻¹𝚆)⁻¹𝚆ᵀ𝙺⁻¹. """ + KiW = solve(K, W) + return inv(K) - KiW @ solve(W.T @ KiW, KiW.T) + + +class ScoreStatistic: + """ + Score-test statistic [1]_ is given by + + 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. + """ + + def __init__(self, P: PMat, K: QSCov, sqrt_dK): + self._P = P + self._K = K + self._sqrt_dK = sqrt_dK + + def statistic(self, y): + """ Compute 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. """ + Py = self._P.dot(y) + return Py.T @ self._sqrt_dK @ self._sqrt_dK.T @ Py / 2 + + def matrix_for_dist_weights(self): + """Compute ½(√∂𝙺)𝙿(√∂𝙺). + + The returned matrix has its eigenvalues equal to the eigenvalues of ½√𝙿(∂𝙺)√𝙿. 
+ """ + return self._sqrt_dK.T @ self._P.dot(self._sqrt_dK) / 2 + + def distr_weights(self): + weights = eigvalsh(self.matrix_for_dist_weights()) + return weights[weights > 1e-16] + + +def score_statistic(y, W, K, dK): + """ + Score-test statistic [1]_ is given by + + 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. + """ + P = P_matrix(W, K) + return y.T @ P @ dK @ P @ y / 2 + + +def score_statistic_qs(y, W, qscov, dK): + """ + Same as `score_statistic`. + """ + P = PMat(qscov, W) + Py = P.dot(y) + return Py.T @ dK @ Py / 2 + + +def score_statistic_distr_weights(W, K, dK): + """ + Score-test statistic follows a weighted sum of random variables [1]_: + + 𝑄 ∼ ∑ᵢ𝜆ᵢχ²(1), + + where 𝜆ᵢ are the non-zero eigenvalues of ½√𝙿(∂𝙺)√𝙿. + """ + P = P_matrix(W, K) + weights = eigvalsh(sqrtm(P) @ dK @ sqrtm(P)) / 2 + return weights[weights > 1e-16] + + +def score_statistic_liu_params(q, weights): + """ + Computes Pr(𝑄 > q) for 𝑄 ∼ ∑ᵢ𝜆ᵢχ²(1) using a modification [3]_ of the original Liu + survival function approximation [2]_. This function also returns estimated + parameters, not yet fully explained here. + """ + from chiscore import liu_sf + + n = len(weights) + # We use the Liu survival function to approximate the distribution followed by a + # linear combination of noncentral chi-squared variables (Q) using only three + # parameters of such distribution: the weights, degrees of freedom, and + # noncentrality (Qh). 𝑄 ∼ ∑λᵢχ²(hᵢ, 𝛿ᵢ), + # where λᵢ, hᵢ, and 𝛿ᵢ are the weights, degrees of freedom (1), and noncentral + # (0) parameters. By setting the last input to True we use the modified version + # [REF]. 
+ (pv, dof_x, _, info) = liu_sf(q, weights, [1] * n, [0] * n, True) + return {"pv": pv, "mu_q": info["mu_q"], "sigma_q": info["sigma_q"], "dof_x": dof_x} + + +def qmin(liu_params): + import scipy.stats as st + from numpy import zeros + + n = len(liu_params) + + # T statistic + T = min(i["pv"] for i in liu_params) + + qmin = zeros(n) + percentile = 1 - T + for i in range(n): + q = st.chi2.ppf(percentile, liu_params[i]["dof_x"]) + mu_q = liu_params[i]["mu_q"] + sigma_q = liu_params[i]["sigma_q"] + dof = liu_params[i]["dof_x"] + qmin[i] = (q - dof) / (2 * dof) ** 0.5 * sigma_q + mu_q + + return qmin + + +def economic_qs(K, epsilon=sqrt(finfo(float).eps)): + r"""Economic eigen decomposition for symmetric matrices. + + A symmetric matrix ``K`` can be decomposed in + :math:`\mathrm Q_0 \mathrm S_0 \mathrm Q_0^\intercal + \mathrm Q_1\ + \mathrm S_1 \mathrm Q_1^ \intercal`, where :math:`\mathrm S_1` is a zero + matrix with size determined by ``K``'s rank deficiency. + + Args: + K (array_like): Symmetric matrix. + epsilon (float): Eigen value threshold. Default is + ``sqrt(finfo(float).eps)``. + + Returns: + tuple: ``((Q0, Q1), S0)``. + """ + + (S, Q) = eigh(K) + + nok = abs(max(Q[0].min(), Q[0].max(), key=abs)) < epsilon + nok = nok and abs(max(K.min(), K.max(), key=abs)) >= epsilon + if nok: + from scipy.linalg import eigh as sp_eigh + + (S, Q) = sp_eigh(K) + + ok = S >= epsilon + nok = logical_not(ok) + S0 = S[ok] + Q0 = Q[:, ok] + Q1 = Q[:, nok] + return ((Q0, Q1), S0) + + +def economic_qs_linear(G): + r"""Economic eigen decomposition for symmetric matrices ``dot(G, G.T)``. + + It is theoretically equivalent to ``economic_qs(dot(G, G.T))``. + Refer to :func:`numpy_sugar.economic_qs` for further information. + + Args: + G (array_like): Matrix. + + Returns: + tuple: ``((Q0, Q1), S0)``. 
+ """ + if G.shape[0] > G.shape[1]: + (Q0, Ssq, _) = svd(G, full_matrices=False) + S0 = Ssq ** 2 + return (Q0, S0) + + Q, S0 = economic_qs(G.dot(G.T)) + return (Q[0], S0) diff --git a/cellregmap/_simulate.py b/cellregmap/_simulate.py new file mode 100644 index 0000000..0665a49 --- /dev/null +++ b/cellregmap/_simulate.py @@ -0,0 +1,479 @@ +from collections import namedtuple +from typing import List, Union + +from numpy import ( + array_split, + asarray, + cumsum, + errstate, + eye, + isscalar, + ones, + repeat, + split, + sqrt, + stack, + zeros, +) +from numpy.random import Generator +from numpy_sugar import ddot +from numpy_sugar.linalg import economic_svd + +from ._types import Term + +Variances = namedtuple("Variances", "g gxe k e n") +Simulation = namedtuple( + "Simulation", "mafs y offset beta_g y_g y_gxe y_k y_e y_n variances G E Lk Ls K M" +) +SimulationFixedGxE = namedtuple( + "Simulation", + "mafs y offset beta_g beta_gxe beta_e y_g y_gxe y_k y_e y_n variances G E Lk K M", +) + + +def sample_maf(n_snps: int, maf_min: float, maf_max: float, random: Generator): + assert maf_min <= maf_max and maf_min >= 0 and maf_max <= 1 + return random.random(n_snps) * (maf_max - maf_min) + maf_min + + +def sample_genotype(n_samples: int, mafs, random): + G = [] + mafs = asarray(mafs, float) + for maf in mafs: + probs = [(1 - maf) ** 2, 1 - ((1 - maf) ** 2 + maf ** 2), maf ** 2] + g = random.choice([0, 1, 2], p=probs, size=n_samples) + G.append(asarray(g, float)) + + return stack(G, axis=1) + + +def column_normalize(X): + X = asarray(X, float) + + with errstate(divide="raise", invalid="raise"): + return (X - X.mean(0)) / X.std(0) + + +def create_environment_matrix( + n_samples: int, n_env: int, groups: List[List[int]], random: Generator +): + E = random.normal(size=[n_samples, n_env]) + E = column_normalize(E) + EE = E @ E.T + EE /= EE.diagonal().mean() + H = sample_covariance_matrix(n_samples, groups)[1] + M = EE + H + M /= M.diagonal().mean() + jitter(M) + return 
_symmetric_decomp(M) + + +def create_environment_vector( + n_samples: int, groups: List[List[int]], random: Generator +): + E = zeros((n_samples, 1)) + + values = random.choice([-1, 1], 2, False) + for value, group in zip(values, groups): + E[group, 0] = value + + return E + + +def sample_covariance_matrix(n_samples: int, groups: List[List[int]]): + X = zeros((n_samples, len(groups))) + + for i, idx in enumerate(groups): + X[idx, i] = 1.0 + + K = X @ X.T + K /= K.diagonal().mean() + jitter(K) + + return (_symmetric_decomp(K), K) + + +def jitter(K): + with errstate(divide="raise", invalid="raise"): + # This small diagonal offset is to guarantee the full-rankness. + K += 1e-8 * eye(K.shape[0]) + + return K + + +def create_variances(r0, v0, has_kinship=True) -> Variances: + """ + Remember that: + + cov(𝐲) = 𝓋₀(1-ρ₀)𝙳𝟏𝟏ᵀ𝙳 + 𝓋₀ρ₀𝙳𝙴𝙴ᵀ𝙳 + 𝓋₁ρ₁EEᵀ + 𝓋₁(1-ρ₁)𝙺 + 𝓋₂𝙸. + + Let us define: + + σ²_g = 𝓋₀(1-ρ₀) (variance explained by persistent genetic effects) + σ²_gxe = 𝓋₀ρ₀ (variance explained by GxE effects) + + σ²_e = 𝓋₁ρ₁ (variance explained by environmental effects) + σ²_k = 𝓋₁(1-ρ₁) (variance explained by population structure) + σ²_n = 𝓋₂ (residual variance, noise) + + We set the total variance to sum up to 1: + + 1 = σ²_g + σ²_gxe + σ²_e + σ²_k + σ²_n + + We set the variances explained by the non-genetic terms to be equal: + + v = σ²_e = σ²_k = σ²_n + + For `has_kinship=False`, we instead set the variances such that: + + v = σ²_e = σ²_n + + Parameters + ---------- + r0 : float + This is ρ₀. + v0 : float + This is 𝓋₀. 
+ """ + v_g = v0 * (1 - r0) + v_gxe = v0 * r0 + + v_k = 0.0 + if has_kinship: + v = (1 - v_gxe - v_g) / 3 + v_e = v + v_k = v + v_n = v + else: + v = (1 - v_gxe - v_g) / 2 + v_e = v + v_n = v + + variances = {"g": v_g, "gxe": v_gxe, "e": v_e, "n": v_n} + if has_kinship: + variances["k"] = v_k + else: + variances["k"] = None + + return Variances(**variances) + + +def sample_persistent_effsizes( + n_effects: int, causal_indices: list, variance: float, random: Generator +): + """ + Let 𝚓 denote a sample index and 𝚔 denote a SNP index. Let 𝚟ⱼ = 𝐠ⱼᵀ𝛃. + We assume that 𝑔ⱼₖ is a random variable such that: + + 𝔼[𝑔ⱼₖ] = 0 + 𝔼[𝑔ⱼₖ²] = 1 + + And we also assume that SNPs are uncorrelated from each other: 𝔼[𝑔ⱼₖ⋅𝑔ⱼᵣ] = 0 + for 𝚔≠𝚛. + Assuming that 𝛃 is given (fixed), we want to simulate 𝛃 such that: + + 𝔼[𝚟ⱼ] = 𝔼[∑ₖ𝑔ⱼₖ𝛽ₖ] = ∑ₖ𝔼[𝑔ⱼₖ]𝛽ₖ = 0 + 𝔼[𝚟ⱼ²] = 𝔼[(∑ₖ𝑔ⱼₖ𝛽ₖ)²] = ∑ₖ𝔼[𝑔ⱼₖ²]𝛽ₖ² = ∑ₖ𝛽ₖ² = 𝓋. + + Let 𝚒 denote a causal index. We initialize 𝛃←𝟎 and then randomly set 𝛽ᵢϵ{-1,+1} for + the causal SNPs. At last, we set 𝛃←𝛃×√(𝓋/𝘯) where 𝘯 is the number of causal SNPs. + This way we have ∑ₖ𝛽ₖ² = 𝓋. + + Parameters + ---------- + n_effects : int + Number of effects. + causal_indices : list + List of causal SNPs. + variance : float + Correspond to 𝓋. + """ + n_causals = len(causal_indices) + + effsizes = zeros(n_effects) + if variance == 0.0: + return effsizes + + effsizes[causal_indices] = random.choice([+1, -1], size=len(causal_indices)) + with errstate(divide="raise", invalid="raise"): + effsizes *= sqrt(variance / n_causals) + + return effsizes + + +def sample_persistent_effects(X, effsizes, variance: float): + y_g = X @ effsizes + if variance > 0: + _ensure_moments(y_g, 0, variance) + return y_g + + +def sample_gxe_effects(G, E, causal_indices: list, variance: float, random: Generator): + """ + Let 𝚒 denote a SNP index and 𝚓 denote an environment. + Let 𝑦₂ = ∑ᵢ(𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ) be the total GxE effect with + + 𝜶ᵢ ∼ 𝓝(𝟎, 𝜎ᵢ²I) + + for every SNP ᵢ. 
+ We have + + 𝔼[𝑦₂] = ∑ᵢ𝔼[𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ] = ∑ᵢ𝔼[𝑔ᵢ]𝔼[𝛜ᵀ𝜶ᵢ] = ∑ᵢ0⋅𝔼[𝛜ᵀ𝜶ᵢ] = 0, + + where 𝑔ᵢ and 𝛜ᵀ𝜶ᵢ are assumed to be uncorrelated. + + We also have + + 𝔼[𝑦₂²] = 𝔼[(∑ᵢ𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ)²] = ∑ᵢ∑ⱼ𝔼[𝜖ⱼ²]𝔼[𝛼ᵢⱼ²] = ∑ᵢ𝜎ᵢ² = 𝜎², + + after a couple of assumptions. + + We define 𝜎ᵢ²=𝑣ᵢ if 𝑔ᵢ is causal and 𝜎ᵢ²=0 otherwise. We assume all causal SNPs + to have equal effect as defined by 𝑣ᵢ=𝜎²/𝑛₂, where 𝑛₂ is the number of SNPs + having GxE effects. + + We also assume that 𝔼[𝜖ⱼ]=0 and 𝔼[𝜖ⱼ²]=1/𝑛ₑ for every environment 𝚓. + """ + n_samples = G.shape[0] + n_envs = E.shape[1] + n_causals = len(causal_indices) + + y2 = zeros(n_samples) + if variance == 0.0: + return y2 + + vi = variance / n_causals + for causal in causal_indices: + # 𝜶ᵢ ∼ 𝓝(𝟎, 𝜎ᵢ²I) + alpha = sqrt(vi) * random.normal(size=n_envs) + + # Make the sample statistics close to population + # statistics + if n_envs > 1: + _ensure_moments(alpha, 0, sqrt(vi)) + + # 𝜷 = 𝛜ᵀ𝜶ᵢ + beta = E @ alpha + + # 𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ + y2 += G[:, causal] * beta + + _ensure_moments(y2, 0, variance) + + return y2 + + +# def sample_environment_effects(E, variance: float, random): +# from numpy import sqrt + +# n_envs = E.shape[1] +# effsizes = sqrt(variance) * random.randn(n_envs) +# y3 = E @ effsizes + +# _ensure_moments(y3, 0, variance) + +# return y3 + + +# def sample_random_effect(K, variance: float, random: Generator): +# y = random.multivariate_normal(zeros(K.shape[0]), K, method="cholesky") + +# _ensure_moments(y, 0, variance) + +# return y + +def _sample_random_effect(X, variance: float, random: Generator): + u = sqrt(variance) * random.normal(size=X.shape[1]) + y = X @ u + + _ensure_moments(y, 0, variance) + + return y + + +def sample_random_effect(X, variance: float, random: Generator): + if not isinstance(X, tuple): + return _sample_random_effect(X, variance, random) + + n = X[0].shape[0] + y = zeros(n) + for L in X: + u = sqrt(variance) * random.normal(size=L.shape[1]) + y += L @ u + _ensure_moments(y, 0, variance) + + return y + + +def 
sample_noise_effects(n_samples: int, variance: float, random: Generator): + y5 = sqrt(variance) * random.normal(size=n_samples) + _ensure_moments(y5, 0, variance) + + return y5 + + +def sample_phenotype_gxe( + offset: float, + n_individuals: int, + n_snps: int, + n_cells: Union[int, List[int]], + n_env_groups: int, + maf_min: float, + maf_max: float, + g_causals: list, + gxe_causals: list, + variances: Variances, + random: Generator, + env_term: Term = Term.RANDOM, +) -> Simulation: + """ + Parameters + ---------- + n_cells + Integer number of array of integers. + """ + mafs = sample_maf(n_snps, maf_min, maf_max, random) + + G = sample_genotype(n_individuals, mafs, random) + G = repeat(G, n_cells, axis=0) + G = column_normalize(G) + + n_samples = G.shape[0] + + if isscalar(n_cells): + individual_groups = array_split(range(n_samples), n_individuals) + else: + individual_groups = split(range(n_samples), cumsum(n_cells))[:-1] + + env_groups = array_split(random.permutation(range(n_samples)), n_env_groups) + + E = sample_covariance_matrix(n_samples, env_groups)[0] + + Lk, K = sample_covariance_matrix(n_samples, individual_groups) + [U, S, _] = economic_svd(E) + us = U * S + Ls = tuple([ddot(us[:, i], Lk) for i in range(us.shape[1])]) + + beta_g = sample_persistent_effsizes(n_snps, g_causals, variances.g, random) + y_g = sample_persistent_effects(G, beta_g, variances.g) + + y_gxe = sample_gxe_effects(G, E, gxe_causals, variances.gxe, random) + + y_k = sample_random_effect(Ls, variances.k, random) + + if env_term is Term.RANDOM: + y_e = sample_random_effect(E, variances.e, random) + elif env_term is Term.FIXED: + n = E.shape[1] + beta_e = sample_persistent_effsizes(n, list(range(n)), variances.e, random) + y_e = sample_persistent_effects(E, beta_e, variances.e) + else: + raise ValueError("Invalid term.") + + y_n = sample_noise_effects(n_samples, variances.n, random) + + M = ones((K.shape[0], 1)) + y = offset + y_g + y_gxe + y_k + y_e + y_n + + simulation = Simulation( + 
mafs=mafs, + offset=offset, + beta_g=beta_g, + y_g=y_g, + y_gxe=y_gxe, + y_k=y_k, + y_e=y_e, + y_n=y_n, + y=y, + variances=variances, + Lk=Lk, + Ls=Ls, + K=K, + E=E, + G=G, + M=M, + ) + + return simulation + + +def sample_phenotype( + offset: float, + n_individuals: int, + n_snps: int, + n_cells: Union[int, List[int]], + n_env: int, + n_env_groups: int, + maf_min: float, + maf_max: float, + g_causals: list, + gxe_causals: list, + variances: Variances, + random: Generator, +) -> Simulation: + """ + Parameters + ---------- + n_cells + Integer number of array of integers. + """ + mafs = sample_maf(n_snps, maf_min, maf_max, random) + + G = sample_genotype(n_individuals, mafs, random) + G = repeat(G, n_cells, axis=0) + G = column_normalize(G) + + n_samples = G.shape[0] + individual_groups = array_split(range(n_samples), n_individuals) + + env_groups = array_split(random.permutation(range(n_samples)), n_env_groups) + E = create_environment_matrix(n_samples, n_env, env_groups, random) + + Lk, K = sample_covariance_matrix(n_samples, individual_groups) + + beta_g = sample_persistent_effsizes(n_snps, g_causals, variances.g, random) + + y_g = sample_persistent_effects(G, beta_g, variances.g) + + y_gxe = sample_gxe_effects(G, E, gxe_causals, variances.gxe, random) + + y_k = sample_random_effect(Lk, variances.k, random) + + y_e = sample_random_effect(E, variances.e, random) + + y_n = sample_noise_effects(n_samples, variances.n, random) + + M = ones((K.shape[0], 1)) + y = offset + y_g + y_gxe + y_k + y_e + y_n + + simulation = Simulation( + mafs=mafs, + offset=offset, + beta_g=beta_g, + y_g=y_g, + y_gxe=y_gxe, + y_k=y_k, + y_e=y_e, + y_n=y_n, + y=y, + variances=variances, + Lk=Lk, + K=K, + E=E, + G=G, + M=M, + ) + + return simulation + + +def _ensure_moments(arr, mean: float, variance: float): + arr -= arr.mean(0) + mean + with errstate(divide="raise", invalid="raise"): + arr /= arr.std(0) + arr *= sqrt(variance) + + +def _symmetric_decomp(H): + [U, S, _] = economic_svd(H) + 
return ddot(U, sqrt(S)) diff --git a/cellregmap/_types.py b/cellregmap/_types.py new file mode 100644 index 0000000..895923a --- /dev/null +++ b/cellregmap/_types.py @@ -0,0 +1,8 @@ +from enum import Enum + +__all__ = ["Term"] + + +class Term(Enum): + FIXED = 1 + RANDOM = 2 diff --git a/struct_lmm2/test/__init__.py b/cellregmap/test/__init__.py similarity index 100% rename from struct_lmm2/test/__init__.py rename to cellregmap/test/__init__.py diff --git a/cellregmap/test/old_test_struct_lmm2.py b/cellregmap/test/old_test_struct_lmm2.py new file mode 100644 index 0000000..24c690d --- /dev/null +++ b/cellregmap/test/old_test_struct_lmm2.py @@ -0,0 +1,99 @@ +import pytest +import scipy.stats as stats +from numpy import array, asarray, eye, hstack, ones, sqrt, stack, tile, zeros +from numpy.random import RandomState +from numpy.testing import assert_allclose + +from struct_lmm2 import StructLMM2 +from struct_lmm2._simulate import ( + column_normalize, + create_environment_matrix, + sample_covariance_matrix, + sample_genotype, + sample_maf, + sample_persistent_effsizes, + create_variances, +) + + +@pytest.fixture +def data(): + seed = 0 + random = RandomState(seed) + n_samples = 100 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + # SNPs with persistent effect + g_snps = [5, 6] + # SNPs with interaction effect + gxe_snps = [10, 11] + # Contribution of interactions (proportion) + r0 = 0.8 + v0 = 0.4 + + # Simulate two environments + E = create_environment_matrix(n_samples) + + mafs = sample_maf(n_snps, maf_min, maf_max, random) + G = sample_genotype(n_samples, mafs, random) + # We normalize the columns of G so that we have 𝔼[𝐠ᵀ𝐠] = 1. 
+ G = column_normalize(G) + + K = sample_covariance_matrix(n_samples, random) + v = create_variances(r0, v0) + beta_g = sample_effect_sizes(n_snps, g_snps, v["v_g"], random) + beta_gxe = sample_effect_sizes(n_snps, gxe_snps, v["v_gxe"], random) + y_g = G @ beta_g + + for i in range(n_snps): + # beta_gxe = random.multivariate_normal(zeros(n_samples), sigma_gxe[i] * Sigma) + beta_gxe = sigma_gxe[i] * u_gxe + y_gxe += G[:, i] * beta_gxe + + # return {"y": y, "W": W, "E": E, "K": K, "G": G} + + +@pytest.mark.skip("Not working yet.") +def test_structlmm_int_2kinships(): + random = RandomState(0) + n = 100 + c = 2 + y = random.randn(n) + M = ones((n, 1)) + W = random.randn(n, c) + E = random.randn(n, 4) + + p_values1 = [] + p_values2 = [] + + for i in range(10): + random = RandomState(i) + g = random.randn(n) + lmm2 = StructLMM2(y, W, E) + lmm2.fit_null(g) + _p = lmm2.score_2_dof(g) + p_values2.append(_p) + + y = y.reshape(y.shape[0], 1) + lmm = StructLMM(y, M, E) + g = g.reshape(g.shape[0], 1) + covs1 = hstack((W, g)) + lmm.fit(covs1) + _p = lmm.score_2dof_inter(g) + p_values1.append(_p) + + print(stats.pearsonr(p_values2, p_values1)[0]) + assert_allclose(p_values2, p_values1, rtol=1e-4) + + +@pytest.mark.skip("Not working yet.") +def test_structlmm2_interaction(data): + y = data["y"] + W = data["W"] + E = data["E"] + K = data["K"] + G = data["G"] + lmm2 = StructLMM2(y, W, E, K) + print(lmm2.scan_interaction(G)) + pass diff --git a/cellregmap/test/test_cellregmap.py b/cellregmap/test/test_cellregmap.py new file mode 100644 index 0000000..e69de29 diff --git a/cellregmap/test/test_fixed_gxe.py b/cellregmap/test/test_fixed_gxe.py new file mode 100644 index 0000000..4327212 --- /dev/null +++ b/cellregmap/test/test_fixed_gxe.py @@ -0,0 +1,137 @@ +import numpy as np +from glimix_core.lmm import LMM +from numpy import arange, asarray, clip, concatenate, inf, median, newaxis +from numpy.random import RandomState, default_rng +from numpy_sugar import epsilon +from 
numpy_sugar.linalg import economic_qs, economic_qs_linear +from scipy.stats import chi2 + +from cellregmap import create_variances, sample_phenotype_fixed_gxe + + +def test_fixed_gxe(): + random = default_rng(20) + + n_individuals = 100 + # n_individuals = 200 + # n_individuals = 500 + # n_individuals = 250 + maf_min = 0.20 + # maf_min = 0.05 + maf_max = 0.45 + # n_snps = 250 + n_snps = 100 + # n_snps = 50 + # n_snps = 20 + # n_cells = 100 + # n_cells = 10 + # n_cells = 2 + n_cells = arange(n_individuals) + 1 + # n_cells = 1 + n_env_groups = 2 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + + s = sample_phenotype_fixed_gxe( + offset=offset, + n_individuals=n_individuals, + n_snps=n_snps, + n_cells=n_cells, + n_env_groups=n_env_groups, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + QS = economic_qs_linear(s.Lk) + # QS = economic_qs(s.K) + + # Test 1: + # H0: s.y_e + s.y_k + s.y_n + # H1: s.y_g + s.y_e + s.y_k + s.y_n + y = s.offset + s.y_g + 0*s.y_gxe + s.y_e + s.y_k + s.y_n + M = concatenate([s.M, s.E], axis=1) + lmm = LMM(y, M, QS, restricted=False) + lmm.fit(verbose=False) + scanner = lmm.get_fast_scanner() + data1 = scanner.fast_scan(s.G) + # Asserting to make sure that DoF==1 + assert data1["effsizes1"].ndim == 1 + dof = 1 + lml0 = scanner.null_lml() + lml1 = data1["lml"] + pv = test1_pvalues = lrt_pvalues(lml0, lml1, dof) + breakpoint() + print(pv[5], pv[6], pv[10], pv[11], median(pv[12:])) + # arange : 4.782253229179153e-192 1.0433583537506347e-196 0.01091492319670193 0.0010927629879084017 4.8750903985107926e-05 + # 2 n_cells : 1.090975503699979e-08 4.899993877637627e-08 0.009198608906693148 0.4204345922908054 0.5335573850563654 + # 10 n_cells : 3.982864661957504e-08 1.9070517075766003e-08 0.09201571609949227 0.35721972899471366 0.543538454382785 + # 100 n_cells: 4.469523142654005e-08 3.0916502150797485e-09 
0.07348502556111537 0.37843373021086246 0.5058335803047611 + + # Test 2: + # H0: s.y_g + s.y_e + s.y_k + s.y_n + # H1: s.y_gxe + s.y_g + s.y_e + s.y_k + s.y_n + y = s.offset + s.y_gxe + s.y_g + s.y_e + s.y_k + s.y_n + assert min(abs(y - s.y)) < 1e-10 + data2 = {"lml0": [], "lml1": []} + dof = 1 + for g in s.G.T: + g = g[:, newaxis] + M = concatenate([s.M, g, s.E], axis=1) + lmm = LMM(y, M, QS, restricted=False) + lmm.fit(verbose=False) + scanner = lmm.get_fast_scanner() + d = scanner.fast_scan(s.E * g) + # Asserting to make sure that DoF==1 + assert d["effsizes1"].ndim == 1 + lml0 = scanner.null_lml() + lml1 = d["lml"] + data2["lml0"].append(lml0) + data2["lml1"].append(lml1) + + lml0 = asarray(data2["lml0"]) + lml1 = concatenate(data2["lml1"]) + pv = test2_pvalues = lrt_pvalues(lml0, lml1, dof) + breakpoint() + print(pv[5], pv[6], pv[10], pv[11], median(pv[12:])) + # arange : 0.6084591769169483 3.6950357207986164e-10 3.457029241798351e-227 1.2283428436867437e-223 0.005415575098636543 + # 2 n_cells : 0.07359659661565397 0.4350647685877501 2.3937927457201096e-07 1.9917895864290987e-16 0.43616571604445775 + # 10 n_cells : 0.3210415979502985 0.007255621562641298 1.109487704555233e-32 1.0877864951200334e-44 0.17000683356223117 + # 100 n_cells: 0.25813564725186755 8.150568151907782e-22 2.2250738585072014e-308 2.2250738585072014e-308 0.00013995763429074527 + pass + +def compute_pvalues(): + stats["pv20"] = lrt_pvalues(stats["lml0"], stats["lml2"], stats["dof20"]) + +def lrt_pvalues(null_lml, alt_lmls, dof=1): + """ + Compute p-values from likelihood ratios. + + These are likelihood ratio test p-values. + + Parameters + ---------- + null_lml : float + Log of the marginal likelihood under the null hypothesis. + alt_lmls : array_like + Log of the marginal likelihoods under the alternative hypotheses. + dof : int + Degrees of freedom. + + Returns + ------- + pvalues : ndarray + P-values. 
+ """ + lrs = clip(-2 * null_lml + 2 * asarray(alt_lmls, float), epsilon.super_tiny, inf) + pv = chi2(df=dof).sf(lrs) + return clip(pv, epsilon.super_tiny, 1 - epsilon.tiny) + diff --git a/cellregmap/test/test_math.py b/cellregmap/test/test_math.py new file mode 100644 index 0000000..75135de --- /dev/null +++ b/cellregmap/test/test_math.py @@ -0,0 +1,91 @@ +import pytest +from celregmap._math import ( + P_matrix, + QSCov, + qmin, + rsolve, + score_statistic, + score_statistic_distr_weights, + score_statistic_liu_params, +) +from numpy import array, eye +from numpy.random import RandomState +from numpy.testing import assert_allclose +from numpy_sugar.linalg import economic_qs + + +@pytest.fixture +def data(): + random = RandomState(0) + n_samples = 3 + n_covariates = 2 + + W = random.randn(n_samples, n_covariates) + + K0 = random.randn(n_samples, n_samples) + K0 = K0 @ K0.T + + v = 0.2 + K = v * K0 + eye(n_samples) + dK = K0 + + alpha = array([0.5, -0.2]) + y = random.multivariate_normal(W @ alpha, K) + + return {"y": y, "W": W, "K": K, "dK": dK} + + +def test_QSCov(data): + rank = 2 + K = data["K"][:, :rank] @ data["K"][:, :rank].T + n_samples = K.shape[0] + QS = economic_qs(K) + a = 0.2 + b = 0.3 + + finalK = a * K + b * eye(n_samples) + + qscov = QSCov(QS[0][0], QS[1], a, b) + v = array([0.3, -0.2, 0.19]) + + assert_allclose(finalK @ v, qscov.dot(v)) + assert_allclose(rsolve(finalK, v), qscov.solve(v)) + + +def test_P_matrix(data): + P = array( + [ + [0.50355613, -0.24203676, -0.34880245], + [-0.24203676, 0.11633617, 0.16765363], + [-0.34880245, 0.16765363, 0.24160792], + ] + ) + assert_allclose(P_matrix(data["W"], data["K"]), P) + + +def test_score_statistic(data): + q = score_statistic(data["y"], data["W"], data["K"], data["dK"]) + assert_allclose(q, 0.49961017073389324) + + +def test_score_statistic_distr_weights(data): + weights = score_statistic_distr_weights(data["W"], data["K"], data["dK"]) + assert_allclose(weights, array([4.55266277e-09, 
3.46249449e-01]), atol=1e-7) + + +def test_score_statistic_liu_params(data): + q = score_statistic(data["y"], data["W"], data["K"], data["dK"]) + weights = score_statistic_distr_weights(data["W"], data["K"], data["dK"]) + params = score_statistic_liu_params(q, weights) + assert_allclose(params["pv"], 0.22966744652848403) + assert_allclose(params["mu_q"], 0.34624945394475326) + assert_allclose(params["sigma_q"], 0.48967066729451103) + assert_allclose(params["dof_x"], 1.0) + + +def test_qmin(data): + params = [] + params += [{"pv": 0.22966742, "mu_q": 0.34945, "sigma_q": 0.48670, "dof_x": 1.5}] + params += [{"pv": 0.65, "mu_q": 0.695, "sigma_q": 0.1, "dof_x": 0.7}] + expected = [0.5506645025120773, 0.7157125486956082] + assert_allclose(qmin(params), expected) diff --git a/cellregmap/test/test_simulation.py b/cellregmap/test/test_simulation.py new file mode 100644 index 0000000..5c54d07 --- /dev/null +++ b/cellregmap/test/test_simulation.py @@ -0,0 +1,217 @@ +import pytest +from numpy import logical_and, ones, tile, zeros +from numpy.linalg import matrix_rank +from numpy.random import RandomState +from numpy.testing import assert_, assert_allclose, assert_equal + +from cellregmap._simulate import ( + column_normalize, + create_environment_matrix, + create_variances, + sample_covariance_matrix, + sample_genotype, + sample_gxe_effects, + sample_maf, + sample_noise_effects, + sample_persistent_effsizes, + sample_phenotype, +) + + +def test_sample_maf(): + random = RandomState(0) + n_snps = 30 + maf_min = 0.2 + maf_max = 0.3 + mafs = sample_maf(n_snps, maf_min, maf_max, random) + assert_(all(logical_and(maf_min <= mafs, mafs <= maf_max))) + assert_(len(mafs) == n_snps) + + +def test_sample_genotype(): + random = RandomState(0) + n_samples = 3 + n_snps = 30 + maf_min = 0.2 + maf_max = 0.3 + mafs = sample_maf(n_snps, maf_min, maf_max, random) + G = sample_genotype(n_samples, mafs, random) + assert_(G.shape == (n_samples, n_snps)) + + A = set(list(G.ravel())) + B = set([0.0, 
1.0, 2.0]) + assert_(A - B == set()) + + +def test_column_normalize(): + random = RandomState(0) + n_samples = 10 + n_snps = 30 + maf_min = 0.2 + maf_max = 0.3 + mafs = sample_maf(n_snps, maf_min, maf_max, random) + G = sample_genotype(n_samples, mafs, random) + G = column_normalize(G) + assert_allclose(G.mean(0), zeros(n_snps), atol=1e-7) + assert_allclose(G.std(0), ones(n_snps)) + + +def test_sample_covariance_matrix(): + random = RandomState(0) + n_samples = 5 + K = sample_covariance_matrix(n_samples, random) + assert_(K.shape == (n_samples, n_samples)) + assert_allclose(K.diagonal().mean(), 1.0) + assert_(matrix_rank(K) == K.shape[0]) + + n_rep = 2 + K = sample_covariance_matrix(n_samples, random, n_rep) + assert_(K.shape == (n_rep * n_samples, n_rep * n_samples)) + assert_allclose(K.diagonal().mean(), 1.0) + assert_(matrix_rank(K) == K.shape[0]) + + +def test_variances(): + r0 = 0.1 + v0 = 0.5 + + v = create_variances(r0, v0) + assert_allclose(v.g + v.gxe + v.k + v.e + v.n, 1.0) + assert_allclose(v.e, v.n) + assert_allclose(v.n, v.k) + + has_kinship = False + v = create_variances(r0, v0, has_kinship) + assert_allclose(v.g + v.gxe + v.e + v.n, 1.0) + assert_allclose(v.e, v.n) + assert_equal(v.k, None) + + +def test_sample_persistent_effsizes(): + random = RandomState(0) + n_effects = 10 + causal_indices = [3, 5] + variance = 0.9 + + with pytest.raises(IndexError): + sample_persistent_effsizes(2, causal_indices, variance, random) + + beta = sample_persistent_effsizes(n_effects, causal_indices, variance, random) + assert_allclose(beta.mean(), 0.0, atol=1e-7) + assert_allclose((beta ** 2).sum(), variance) + + +def test_sample_gxe_effects(): + random = RandomState(0) + n_samples = 10 + n_snps = 30 + n_rep = 2 + maf_min = 0.2 + maf_max = 0.3 + mafs = sample_maf(n_snps, maf_min, maf_max, random) + + G = sample_genotype(n_samples, mafs, random) + G = tile(G, (n_rep, 1)) + G = column_normalize(G) + + E = random.randn(5, 5) + E = create_environment_matrix(E, n_samples, 
n_rep, 3, random) + E = column_normalize(E) + + causal_indices = [3, 5, 8] + variance = 0.9 + + y2 = sample_gxe_effects(G, E, causal_indices, variance, random) + assert_allclose(y2.mean(), 0.0, atol=1e-7) + assert_allclose(y2.var(), variance) + + +# TODO: put it back +# def test_sample_environment_effects(): +# random = RandomState(0) +# n_samples = 10 + +# E = random.randn(5, 5) +# E = create_environment_matrix(E, n_samples, 2, 3, random) +# E = column_normalize(E) + +# variance = 0.3 +# y3 = sample_environment_effects(E, variance, random) + +# assert_allclose(y3.mean(), 0.0, atol=1e-7) +# assert_allclose(y3.var(), variance) + + +# def test_sample_population_effects(): +# random = RandomState(0) +# n_samples = 10 +# variance = 0.4 +# n_rep = 1 +# K = sample_covariance_matrix(n_samples, random, n_rep) +# y4 = sample_population_effects(K, variance, random) + +# assert_allclose(y4.mean(), 0.0, atol=1e-7) +# assert_allclose(y4.var(), variance) + + +def test_sample_noise_effects(): + random = RandomState(0) + n_samples = 10 + + variance = 0.3 + y5 = sample_noise_effects(n_samples, variance, random) + + assert_allclose(y5.mean(), 0.0, atol=1e-7) + assert_allclose(y5.var(), variance) + + +def test_sample_phenotype(): + from numpy import corrcoef + + random = RandomState(0) + + r0 = 0.1 + v0 = 0.5 + v = create_variances(r0, v0) + + E = random.randn(30, 10) + + offset = 0.3 + n_samples = 500 + n_rep = 2 + n_env = 3 + s = sample_phenotype( + offset=offset, + E=E, + n_samples=n_samples, + n_snps=300, + n_rep=n_rep, + n_env=n_env, + maf_min=0.1, + maf_max=0.4, + g_causals=[3, 4], + gxe_causals=[4, 5], + variances=v, + random=random, + ) + assert_allclose( + s.y_g.var() + s.y_gxe.var() + s.y_n.var() + s.y_e.var() + s.y_k.var(), 1.0 + ) + assert_(abs(corrcoef(s.y_g, s.y_gxe)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_g, s.y_n)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_g, s.y_e)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_g, s.y_k)[0, 1]) < 0.1) + + assert_(abs(corrcoef(s.y_gxe, 
s.y_n)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_gxe, s.y_e)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_gxe, s.y_k)[0, 1]) < 0.1) + + assert_(abs(corrcoef(s.y_n, s.y_e)[0, 1]) < 0.1) + assert_(abs(corrcoef(s.y_n, s.y_k)[0, 1]) < 0.1) + + assert_(abs(corrcoef(s.y_e, s.y_k)[0, 1]) < 0.1) + assert_allclose(s.y, offset + s.y_g + s.y_gxe + s.y_n + s.y_e + s.y_k) + assert_allclose(s.y.mean(), offset) + + assert_(s.E.shape[0] == n_samples * n_rep) + assert_(s.E.shape[1] == n_env) diff --git a/cellregmap/test/test_struct_lmm2.py b/cellregmap/test/test_struct_lmm2.py new file mode 100644 index 0000000..22aa2e6 --- /dev/null +++ b/cellregmap/test/test_struct_lmm2.py @@ -0,0 +1,451 @@ +from numpy import arange, asarray, eye, median, min, ones +from numpy.linalg import cholesky +from numpy.random import RandomState, default_rng +from numpy.testing import assert_, assert_allclose + +from cellregmap import create_variances, sample_phenotype, sample_phenotype_gxe + + +# from struct_lmm import StructLMM +# @pytest.mark.skip +def test_struct_lmm2_assoc(): + random = RandomState(0) + + n_samples = 50 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + n_rep = 1 + n_env = 2 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + E = random.normal(size=[n_samples, n_env]) + + s = sample_phenotype( + offset=offset, + E=E, + n_samples=n_samples, + n_snps=n_snps, + n_rep=n_rep, + n_env=n_env, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + M = ones((n_samples, 1)) + # slmm = StructLMM(s.y.copy(), M=M, E=s.E, W=s.E) + # slmm.fit(verbose=False) + + # p_values0 = [] + # print() + # for i in range(n_snps): + # g = s.G[:, [i]] + # p = slmm.score_2dof_assoc(g) + # print("{}\t{}".format(i, p)) + # p_values0.append(p) + + slmm2 = StructLMM2(s.y, M, s.E) + slmm2.fit_null_association() + p_values1 = slmm2.scan_association(s.G) + # for i, pv in 
enumerate(p_values1): + # print("{}\t{}".format(i, pv)) + + +def test_struct_lmm2_inter(): + random = RandomState(3) + + n_samples = 280 + # n_samples = 500 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + n_rep = 1 + n_env = 4 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + E = random.normal(size=[n_samples, 2]) + + v = create_variances(r0, v0) + + s = sample_phenotype( + offset=offset, + E=E, + n_samples=n_samples, + n_snps=n_snps, + n_rep=n_rep, + n_env=n_env, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + M = ones((n_samples, 1)) + + # p_values0 = [] + # for i in range(n_snps): + # g = s.G[:, [i]] + # Mg = concatenate([M, g], axis=1) + # slmm_int = StructLMM(s.y.copy(), M=Mg, E=s.E, W=s.E) + # slmm_int.fit(verbose=False) + # p = slmm_int.score_2dof_inter(g) + # print("{}\t{}".format(i, p)) + # p_values0.append(p) + + # TODO: add kinship to test it properly + slmm2 = StructLMM2(s.y, M, s.E) + pvals = slmm2.scan_interaction(s.G) + + causal_pvalues = [pvals[i] for i in range(len(pvals)) if i in gxe_causals] + noncausal_pvalues = [pvals[i] for i in range(len(pvals)) if i not in gxe_causals] + causal_pvalues = asarray(causal_pvalues) + noncausal_pvalues = asarray(noncausal_pvalues) + + assert_(all(causal_pvalues < 1e-7)) + assert_(all(noncausal_pvalues > 1e-3)) + + +def test_struct_lmm2_inter_kinship(): + random = RandomState(8) + + n_samples = 346 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + n_rep = 1 + n_env = 3 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + E = random.normal(size=[n_samples, 2]) + + v = create_variances(r0, v0) + + s = sample_phenotype( + offset=offset, + E=E, + n_samples=n_samples, + n_snps=n_snps, + n_rep=n_rep, + n_env=n_env, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + M = ones((n_samples, 1)) + + 
G_kinship = cholesky(s.K + eye(n_samples) * 1e-7) + slmm2 = StructLMM2(s.y, M, s.E, G_kinship) + pvals = slmm2.scan_interaction(s.G) + + causal_pvalues = [pvals[i] for i in range(len(pvals)) if i in gxe_causals] + noncausal_pvalues = [pvals[i] for i in range(len(pvals)) if i not in gxe_causals] + causal_pvalues = asarray(causal_pvalues) + noncausal_pvalues = asarray(noncausal_pvalues) + + assert_(all(causal_pvalues < 1e-7)) + assert_(all(noncausal_pvalues > 1e-3)) + + +def test_struct_lmm2_inter_kinship_permute(): + random = RandomState(2) + + n_samples = 500 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + n_rep = 1 + n_env = 3 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + E = random.normal(size=[n_samples, 2]) + + v = create_variances(r0, v0) + + s = sample_phenotype( + offset=offset, + E=E, + n_samples=n_samples, + n_snps=n_snps, + n_rep=n_rep, + n_env=n_env, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + M = ones((n_samples, 1)) + + G_kinship = cholesky(s.K + eye(n_samples) * 1e-7) + slmm2 = StructLMM2(s.y, M, s.E, G_kinship) + random = RandomState(1) + idx = random.permutation(s.E.shape[0]) + pvals = slmm2.scan_interaction(s.G, idx_E=idx) + assert_(median(pvals) > 0.3) + assert_(min(pvals) > 0.04) + + +def test_struct_lmm2_inter_kinship_repetition(): + import numpy as np + + random = default_rng(20) + + n_individuals = 100 + # n_individuals = 200 + # n_individuals = 500 + # n_individuals = 250 + maf_min = 0.40 + # maf_min = 0.20 + # maf_min = 0.05 + maf_max = 0.45 + # n_snps = 250 + n_snps = 100 + # n_snps = 50 + # n_snps = 20 + # n_cells = 100 + # n_cells = 10 + # n_cells = 2 + n_cells = arange(n_individuals) + 1 + # n_cells = 1 + n_env = 2 + n_env_groups = 3 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + + s = sample_phenotype( + offset=offset, + 
n_individuals=n_individuals, + n_snps=n_snps, + n_cells=n_cells, + n_env=n_env, + n_env_groups=n_env_groups, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + slmm2 = StructLMM2(s.y, s.M, s.E, s.Lk) + # idx = random.permutation(s.E.shape[0]) + # pvals = slmm2.scan_interaction(s.G, idx_E=idx) + from scipy.stats.stats import pearsonr + + corr = {} + corr_pvals = {} + + corr["y_e"] = [pearsonr(s.y_e, s.G[:, i])[0] for i in range(s.G.shape[1])] + corr_pvals["y_e"] = [pearsonr(s.y_e, s.G[:, i])[1] for i in range(s.G.shape[1])] + + corr["y_n"] = [pearsonr(s.y_n, s.G[:, i])[0] for i in range(s.G.shape[1])] + corr_pvals["y_n"] = [pearsonr(s.y_n, s.G[:, i])[1] for i in range(s.G.shape[1])] + + pvals, info = slmm2.scan_interaction(s.G) + print(pearsonr(pvals, np.abs(corr["y_e"]))) + print(pearsonr(pvals, np.abs(corr["y_n"]))) + assert_(median(pvals) > 0.3) + assert_(min(pvals) > 0.04) + + +def test_struct_lmm2_inter_kinship_repetition_gxe(): + import numpy as np + + random = default_rng(20) + + n_individuals = 100 + # n_individuals = 200 + # n_individuals = 500 + # n_individuals = 250 + maf_min = 0.40 + # maf_min = 0.20 + # maf_min = 0.05 + maf_max = 0.45 + # n_snps = 250 + n_snps = 100 + # n_snps = 50 + # n_snps = 20 + # n_cells = 100 + # n_cells = 10 + # n_cells = 2 + n_cells = arange(n_individuals) + 1 + # n_cells = 1 + n_env = 2 + n_env_groups = 3 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + + # Timing: + # - n_individuals + # - n_env_groups + # - n_samples + + s = sample_phenotype_gxe( + offset=offset, + n_individuals=n_individuals, + n_snps=n_snps, + n_cells=n_cells, + n_env=n_env, + n_env_groups=n_env_groups, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) + # idx = random.permutation(s.E.shape[0]) + # 
pvals = slmm2.scan_interaction(s.G, idx_E=idx) + from scipy.stats.stats import pearsonr + + corr = {} + corr_pvals = {} + + corr["y_e"] = [pearsonr(s.y_e, s.G[:, i])[0] for i in range(s.G.shape[1])] + corr_pvals["y_e"] = [pearsonr(s.y_e, s.G[:, i])[1] for i in range(s.G.shape[1])] + + corr["y_n"] = [pearsonr(s.y_n, s.G[:, i])[0] for i in range(s.G.shape[1])] + corr_pvals["y_n"] = [pearsonr(s.y_n, s.G[:, i])[1] for i in range(s.G.shape[1])] + + pvals, info = slmm2.scan_interaction(s.G) + print(pearsonr(pvals, np.abs(corr["y_e"]))) + print(pearsonr(pvals, np.abs(corr["y_n"]))) + assert_(median(pvals) > 0.3) + assert_(min(pvals) > 0.04) + + +def test_struct_lmm2_inter_kinship_predict(): + random = RandomState(0) + + n_individuals = 100 + maf_min = 0.05 + maf_max = 0.45 + n_snps = 20 + n_cells = 2 + n_env_groups = 3 + offset = 0.3 + r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + + s = sample_phenotype_gxe( + offset=offset, + n_individuals=n_individuals, + n_snps=n_snps, + n_cells=n_cells, + n_env_groups=n_env_groups, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) + beta_g_s, beta_gxe_s = slmm2.predict_interaction(s.G, s.mafs) + assert_allclose(beta_g_s[3], -0.07720025290188615) + assert_allclose(beta_g_s[-1], 0.022415332334122483) + assert_allclose(beta_gxe_s[1, 1], 0.010062608120425824) + assert_allclose(beta_gxe_s[13, -1], -0.05566938579548831) + + +def test_struct_lmm2_estimate_aggregate_environment(): + random = default_rng(20) + + n_individuals = 100 + # n_individuals = 200 + # n_individuals = 500 + # n_individuals = 250 + maf_min = 0.40 + # maf_min = 0.20 + # maf_min = 0.05 + maf_max = 0.45 + # n_snps = 250 + n_snps = 100 + # n_snps = 50 + # n_snps = 20 + # n_cells = 100 + # n_cells = 10 + n_cells = 2 + # n_cells = arange(n_individuals) + 1 + # n_cells = 1 + n_env_groups = 3 + offset = 0.3 + 
r0 = 0.5 + v0 = 0.5 + g_causals = [5, 6] + gxe_causals = [10, 11] + + v = create_variances(r0, v0) + + # Timing: + # - n_individuals + # - n_env_groups + # - n_samples + + s = sample_phenotype_gxe( + offset=offset, + n_individuals=n_individuals, + n_snps=n_snps, + n_cells=n_cells, + n_env_groups=n_env_groups, + maf_min=maf_min, + maf_max=maf_max, + g_causals=g_causals, + gxe_causals=gxe_causals, + variances=v, + random=random, + ) + + slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) + import numpy as np + for i in range(5): + a = slmm2.estimate_aggregate_environment(s.G[:, i]) + print(np.var(a)) + b = slmm2.estimate_aggregate_environment(s.G[:, 10]) + c = slmm2.estimate_aggregate_environment(s.G[:, 11]) + print(np.var(b)) + print(np.var(c)) + pass diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..566feac --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,61 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'CellRegMap' +copyright = '2022, Anna Cuomo' +author = 'Anna Cuomo' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.viewcode", +] + +autodoc_default_flags = ["members"] +autosummary_generate = True +napoleon_numpy_docstring = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +pygments_style = "default" +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ['_static'] diff --git a/docs/functions.rst b/docs/functions.rst new file mode 100644 index 0000000..191bcce --- /dev/null +++ b/docs/functions.rst @@ -0,0 +1,15 @@ +********** +CellRegMap +********** + +.. currentmodule:: cellregmap + +.. autosummary:: + :toctree: _autosummary + :template: class.rst + + CellRegMap + run_association + run_interaction + estimate_betas + # lrt_pvalues diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..4610cec --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,39 @@ +.. CellRegMap documentation master file, created by + sphinx-quickstart on Mon Jan 10 23:46:16 2022. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to CellRegMap's documentation! +====================================== + +What is CellRegMap +------------------ + +The cellular regulatory map (``cellregmap``) is a statistical framework to map context-specific effects of genetic variants on single-cell gene expression. +It builds on a linear mixed model and is implemented in Python. + +What do you need to run CellRegMap +---------------------------------- + +Single-cell expression profiles assayed through scRNA-seq in the form of a count matrix (cells x genes) + +Cellular contexts: these can be known factors or a latent representation of the space (e.g., PCs by cells) + +Genotypes: CellRegMap tests for individual eQTL effects between a genetic variant and a gene's expression, so genotypes need to be provided for variants of interest. + +A kinship matrix accounting for relatedness among samples. + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + install + functions + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 0000000..cb55400 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,26 @@ +****************** +Install CellRegMap +****************** + +Stable version +-------------- + +``cellregmap`` can be installed `from PyPI `_ with ``pip``: +:: + python3 -m pip install cellregmap + + +Development version +------------------- + +To install ``cellregmap`` in development mode, install it from the `GitHub repository `_: +:: + git clone https://github.com/limix/CellRegMap.git + cd CellRegMap + python3 -m pip install -e . + + +Docker installation +------------------- + +If you use Docker, you can also pull the `pre-built image from dockerhub `_ diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..153be5e --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..3bcf96c --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +sphinx>=2.1.2 +# cellregmap +git+https://github.com/limix/CellRegMap.git diff --git a/get_data.py b/get_data.py deleted file mode 100644 index 8b532d0..0000000 --- a/get_data.py +++ /dev/null @@ -1,33 +0,0 @@ -import pandas as pd -import limix - -def get_genotype_data(geno_prefix): - bim,fam,bed = limix.io.plink.read(geno_prefix, verbose = False) - fam.set_index('iid', inplace = True) - return bim,fam,bed - -''' -given a df of phenotype vectors df_y -and a sample mapping file -mapping genotype samples (e.g. individuals) to phenotype samples (e.g. cells) -returns genotype samples corresponding to the samples in y -''' -def map_samples(df_y, sample_mapping): - df0 = pd.read_csv(sample_mapping, sep = " ") - df0 = df0.set_index("phenotype_sample") - return df0.loc[df_y.index, "genotype_sample"] - -''' -given a df of phenotype vectors df_y -and a kinship matrix -returns kinship only containing samples included in phenotype -with the possibility of mapping samples first -''' -def filter_kinship(K, df_y, sample_mapping = None): - if sample_mapping is not None: - samples = map_samples(df_y, sample_mapping) - else: samples = df_y.index - K_filtered = K[samples,:][:,samples] - return K_filtered - -kinship_mat = kinship_df.loc[individual_ids,individual_ids].values if kinship_df is not None else None \ No newline at end of file diff --git a/proof.md b/proof.md new file mode 100644 index 0000000..142ee3e --- /dev/null +++ b/proof.md @@ -0,0 +1,79 @@ +K = GGt, Sigma = EEt +M = K*Sigma + +Let & be the Hadamard product. 
+ +We know that: + + rank(A&B) <= rank(A) * rank(B) + +Let + + K = U @ Dk @ Ut + Sigma = V @ Ds @ Vt + +for diagonal matrices Dk and Ds. + +K & Sigma = K & (Sum_i vi * lambda_i * vi^T ) = Sum_i (K & (vi @ lambda_i @ vi^T)) + = Sum_i ( Lki @ Lki^T ) + +Let e = |1 1 ... 1|. We have + + K & (vi @ lambda_i @ vi^T) = K & (ui @ ui^T) + = K & (Diag(ui) @ e @ e^T @ Diag(ui)) + = Diag(ui) @ (K & (e @ e^T)) @ Diag(ui) (See (2.10), [1]) + = Diag(ui) @ K @ Diag(ui) + = Diag(ui) @ G @ Gt @ Diag(ui) + = Lki @ Lki^T + +for Lki = Diag(ui) @ G. + +Trial +----- + +```python + import numpy as np + + K = np.random.randn(3, 3) + K = K @ K.T + + # rank-2 symmetric matrix + u0 = np.random.randn(3, 1) + u1 = np.random.randn(3, 1) + Sigma = u0 @ u0.T + u1 @ u1.T + sigma, U = np.linalg.eigh(Sigma) + + tmp = sigma[1] * U[:, [1]] @ U[:, [1]].T + sigma[2] * U[:, [2]] @ U[:, [2]].T + # Close to zero + print(tmp - Sigma) + + u1 = np.sqrt(sigma[1]) * U[:, [1]] + u2 = np.sqrt(sigma[2]) * U[:, [2]] + + tmp = u1 @ u1.T + u2 @ u2.T - Sigma + # Close to zero + print(tmp) + + e = np.ones(3) + + G = np.linalg.cholesky(K) + + Lk1 = np.diag(u1.ravel()) @ G + Lk2 = np.diag(u2.ravel()) @ G + + # Close to zero + print(Lk1 @ Lk1.T + Lk2 @ Lk2.T - K * Sigma) +``` + +Goal +---- + +- tr(K & Sigma) (there is a property) +- det(K & Sigma) +- (K & Sigma)^{-1} @ x + +References +---------- + +[1] Hadamard Products and Multivariate Statistical Analysis. 
+ https://core.ac.uk/download/pdf/82272966.pdf diff --git a/greater power and computational efficiency.pdf.bz2 b/references/greater power and computational efficiency.pdf.bz2 similarity index 100% rename from greater power and computational efficiency.pdf.bz2 rename to references/greater power and computational efficiency.pdf.bz2 diff --git a/rachel thesis part 1.pdf.bz2 b/references/rachel thesis part 1.pdf.bz2 similarity index 100% rename from rachel thesis part 1.pdf.bz2 rename to references/rachel thesis part 1.pdf.bz2 diff --git a/run_structLMM2.py b/run_structLMM2.py deleted file mode 100644 index cbbe4f5..0000000 --- a/run_structLMM2.py +++ /dev/null @@ -1,188 +0,0 @@ -def _mean_standardize(X, axis = None): - from numpy_sugar import epsilon - from numpy import inf - - X = X - np.nanmean(X,0) - X = X / np.nanstd(X,0) - - return X - -def get_genotype_data(geno_prefix): - bim,fam,bed = limix.io.plink.read(geno_prefix, verbose = False) - fam.set_index('iid', inplace = True) - return bim,fam,bed - -def run_structLMM_QTL_analysis_load_intersect_phenotype_environments_covariates_kinship_sample_mapping\ - (pheno_filename, anno_filename, env_filename, geno_prefix, plinkGenotype, - cis_mode = True, interaction_mode = True, - relatedness_score = 0.95, snps_filename = None, feature_filename = None, - snp_feature_filename = None, selection = 'all', covariates_filename = None, kinship_filename = None, - sample_mapping_filename = None, feature_variant_covariate_filename = None): - selectionStart = None - selectionEnd = None - if(":" in selection): - parts = selection.split(":") - if("-" not in parts[1]): - print("No correct sub selection.") - print("Given in: "+selection) - print("Expected format: (chr number):(start location)-(stop location)") - sys.exit() - chromosome = parts[0] - if("-" in parts[1]): - parts2 = parts[1].split("-") - selectionStart = int(parts2[0]) - selectionEnd = int(parts2[1]) - else : - chromosome = selection - - ''' function to take input and 
intersect sample and genotype.''' - #Load input data files & filter for relevant data - #Load input data filesf - - phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename) - annotation_df = qtl_loader_utils.get_annotation_df(anno_filename) - - if(plinkGenotype): - bim,fam,bed = get_genotype_data(geno_prefix) - annotation_df.replace(['X', 'Y', 'XY', 'MT'], ['23', '24', '25', '26'], inplace = True) - if chromosome == 'X' : - chromosome = '23' - elif chromosome == 'Y': - chromosome = '24' - elif chromosome == 'XY': - chromosome = '25' - elif chromosome == 'MT': - chromosome = '26' - - - else : - geno_prefix+='.bgen' - print(geno_prefix) - print("Intersecting data.") - - if(annotation_df.shape[0] != annotation_df.groupby(annotation_df.index).first().shape[0]): - print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file") - sys.exit() - - ##Make sure that there is only one entry per feature id!. - sample2individual_df = qtl_loader_utils.get_samplemapping_df(sample_mapping_filename, list(phenotype_df.columns), 'sample') - sample2individual_df['sample'] = sample2individual_df.index - sample2individual_df = sample2individual_df.drop_duplicates(); - - - ##Filter first the linking files! - #Subset linking to relevant genotypes. - orgSize = sample2individual_df.shape[0] - sample2individual_df = sample2individual_df.loc[sample2individual_df['iid'].map(lambda x: x in list(map(str, fam.index))),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.") - - #Subset linking to relevant phenotypes. 
- sample2individual_df = sample2individual_df.loc[np.intersect1d(sample2individual_df.index,phenotype_df.columns),:] - diff = orgSize- sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the phenotype file.") - #Subset linking vs kinship. - kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename) - if kinship_df is not None: - #Filter from individual2sample_df & sample2individual_df since we don't want to filter from the genotypes. - sample2individual_df = sample2individual_df[sample2individual_df['iid'].map(lambda x: x in list(map(str, kinship_df.index)))] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the kinship file.") - #Subset linking vs covariates. - covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename) - if covariate_df is not None: - if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]: covariate_df.insert(0, 'ones',np.ones(covariate_df.shape[0])) - sample2individual_df = sample2individual_df.loc[list(set(sample2individual_df.index) & set(covariate_df.index)),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the covariate file.") - #Subset linking vs environments. 
- environment_df = qtl_loader_utils.get_env_df(env_filename) - # import pdb; pdb.set_trace() - # if np.nansum(environment_df == 1, 0).max() < environment_df.shape[0]: environment_df.insert(0, 'ones',np.ones(environment_df.shape[0])) - sample2individual_df = sample2individual_df.loc[list(set(sample2individual_df.index) & set(environment_df.index)),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the environment file.") - - ### - print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0])) - if(sample2individual_df.shape[0] < minimum_test_samples): - print("Not enough samples with both genotype & phenotype data.") - sys.exit() - - ##Filter now the actual data! - #Filter phenotype data based on the linking files. - phenotype_df = phenotype_df.loc[list(set(phenotype_df.index) & set(annotation_df.index)), sample2individual_df.index.values] - - #Filter kinship data based on the linking files. - genetically_unique_individuals = None - if kinship_df is not None: - kinship_df = kinship_df.loc[np.intersect1d(kinship_df.index, sample2individual_df['iid']), np.intersect1d(kinship_df.index, sample2individual_df['iid'])] - genetically_unique_individuals = utils.get_unique_genetic_samples(kinship_df, relatedness_score); - - #Filter covariate data based on the linking files. - if covariate_df is not None: - covariate_df = covariate_df.loc[np.intersect1d(covariate_df.index, sample2individual_df.index.values),:] - - snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename) - try: - feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename) - except: - if feature_filename is not None: - feature_filter_df = pd.DataFrame(index = feature_filename) - #Do filtering on features. 
- if feature_filter_df is not None: - phenotype_df = phenotype_df.loc[feature_filter_df.index,:] - ##Filtering on features to test. - if snp_feature_filter_df is not None: - phenotype_df = phenotype_df.loc[np.unique(snp_feature_filter_df['feature']),:] - ##Filtering on features to test from the combined feature snp filter. - - if ((not cis_mode) and len(set(bim['chrom'])) < 22) : - print("Warning, running a trans-analysis on snp data from less than 22 chromosomes.\nTo merge data later the permutation P-values need to be written out.") - - if(cis_mode): - #Remove features from the annotation that are on chromosomes which are not present anyway. - annotation_df = annotation_df[np.in1d(annotation_df['chromosome'], list(set(bim['chrom'])))] - - #Prepare to filter on snps. - snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename) - if snp_filter_df is not None: - toSelect = set(snp_filter_df.index).intersection(set(bim['snp'])) - bim = bim.loc[bim['snp'].isin(toSelect)] - ##Filtering on SNPs to test from the snp filter. - - if snp_feature_filter_df is not None: - toSelect = set(np.unique(snp_feature_filter_df['snp_id'])).intersection(set(bim['snp'])) - bim = bim.loc[bim['snp'].isin(toSelect)] - ##Filtering on features to test from the combined feature snp filter. 
- - - #Determine features to be tested - if chromosome=='all': - feature_list = list(set(annotation_df.index) & set(phenotype_df.index)) - else: - if not selectionStart is None : - lowest = min([selectionStart,selectionEnd]) - highest = max([selectionStart,selectionEnd]) - annotation_df['mean'] = ((annotation_df["start"] + annotation_df["end"])/2) - feature_list = list(set(annotation_df.iloc[(annotation_df['chromosome'].values == chromosome) & (annotation_df['mean'].values >= lowest) & (annotation_df["mean"].values < highest)].index.values) & set(phenotype_df.index)) - del annotation_df['mean'] - else : - feature_list = list(set(annotation_df[annotation_df['chromosome'] == chromosome].index) & set(phenotype_df.index)) - - print("Number of features to be tested: " + str(len(feature_list))) - print("Total number of variants to be considered, before variante QC and feature intersection: " + str(bim.shape[0])) - - - feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename) - return [phenotype_df, kinship_df, covariate_df, environment_df, sample2individual_df, complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, genetically_unique_individuals, minimum_test_samples, feature_list,bim,fam,bed, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df] - - - - - diff --git a/run_structLMM2_int.py b/run_structLMM2_int.py deleted file mode 100644 index 5d60962..0000000 --- a/run_structLMM2_int.py +++ /dev/null @@ -1,381 +0,0 @@ - -import sys -sys.path.insert(0,'/hps/nobackup/stegle/users/mjbonder/tools/hipsci_pipeline/limix_QTL_pipeline/') - -# import glob -# import os -# # from subprocess import run -# import pandas as pd -# import re -# from os.path import join -# import scipy as sp -import numpy as np -# # from struct_lmm.interpretation import PredictGenEffect -from sklearn.preprocessing import Imputer -import limix -# import qtl_output -import qtl_loader_utils -# import qtl_parse_args -# 
import qtl_utilities as utils -# from bgen_reader import read_bgen -# from numpy import linalg as LA -# import scipy.linalg as la -from tqdm import tqdm - -from limix_core.covar import FreeFormCov -from limix_core.gp import GP2KronSumLR - -def _mean_standardize(X, axis = None): - from numpy_sugar import epsilon - from numpy import inf - - X = X - np.nanmean(X,0) - X = X / np.nanstd(X,0) - - return X - -def get_genotype_data(geno_prefix): - bim,fam,bed = limix.io.plink.read(geno_prefix, verbose = False) - fam.set_index('iid', inplace = True) - return bim,fam,bed - -def run_structLMM2_int_load_intersect_phenotype_environments_covariates_kinship_sample_mapping\ - (pheno_filename, anno_filename, env_filename, geno_prefix, plinkGenotype, - cis_mode = True, interaction_mode = True, - relatedness_score = 0.95, snps_filename = None, feature_filename = None, - snp_feature_filename = None, selection = 'all', covariates_filename = None, kinship_filename = None, - sample_mapping_filename = None, feature_variant_covariate_filename = None): - selectionStart = None - selectionEnd = None - if(":" in selection): - parts = selection.split(":") - if("-" not in parts[1]): - print("No correct sub selection.") - print("Given in: "+selection) - print("Expected format: (chr number):(start location)-(stop location)") - sys.exit() - chromosome = parts[0] - if("-" in parts[1]): - parts2 = parts[1].split("-") - selectionStart = int(parts2[0]) - selectionEnd = int(parts2[1]) - else : - chromosome = selection - - ''' function to take input and intersect sample and genotype.''' - #Load input data files & filter for relevant data - #Load input data filesf - - phenotype_df = qtl_loader_utils.get_phenotype_df(pheno_filename) - annotation_df = qtl_loader_utils.get_annotation_df(anno_filename) - - if(plinkGenotype): - bim,fam,bed = get_genotype_data(geno_prefix) - annotation_df.replace(['X', 'Y', 'XY', 'MT'], ['23', '24', '25', '26'], inplace = True) - if chromosome == 'X' : - chromosome = '23' - 
elif chromosome == 'Y': - chromosome = '24' - elif chromosome == 'XY': - chromosome = '25' - elif chromosome == 'MT': - chromosome = '26' - - - else : - geno_prefix+='.bgen' - print(geno_prefix) - print("Intersecting data.") - - if(annotation_df.shape[0] != annotation_df.groupby(annotation_df.index).first().shape[0]): - print("Only one location per feature supported. If multiple locations are needed please look at: --extended_anno_file") - sys.exit() - - ##Make sure that there is only one entry per feature id!. - sample2individual_df = qtl_loader_utils.get_samplemapping_df(sample_mapping_filename, list(phenotype_df.columns), 'sample') - sample2individual_df['sample'] = sample2individual_df.index - sample2individual_df = sample2individual_df.drop_duplicates(); - - - ##Filter first the linking files! - #Subset linking to relevant genotypes. - orgSize = sample2individual_df.shape[0] - sample2individual_df = sample2individual_df.loc[sample2individual_df['iid'].map(lambda x: x in list(map(str, fam.index))),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: "+str(diff)+" samples because they are not present in the genotype file.") - - #Subset linking to relevant phenotypes. - sample2individual_df = sample2individual_df.loc[np.intersect1d(sample2individual_df.index,phenotype_df.columns),:] - diff = orgSize- sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the phenotype file.") - #Subset linking vs kinship. - kinship_df = qtl_loader_utils.get_kinship_df(kinship_filename) - if kinship_df is not None: - #Filter from individual2sample_df & sample2individual_df since we don't want to filter from the genotypes. 
- sample2individual_df = sample2individual_df[sample2individual_df['iid'].map(lambda x: x in list(map(str, kinship_df.index)))] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the kinship file.") - #Subset linking vs covariates. - covariate_df = qtl_loader_utils.get_covariate_df(covariates_filename) - if covariate_df is not None: - if np.nansum(covariate_df == 1, 0).max() < covariate_df.shape[0]: covariate_df.insert(0, 'ones',np.ones(covariate_df.shape[0])) - sample2individual_df = sample2individual_df.loc[list(set(sample2individual_df.index) & set(covariate_df.index)),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the covariate file.") - #Subset linking vs environments. - environment_df = qtl_loader_utils.get_env_df(env_filename) - # import pdb; pdb.set_trace() - # if np.nansum(environment_df == 1, 0).max() < environment_df.shape[0]: environment_df.insert(0, 'ones',np.ones(environment_df.shape[0])) - sample2individual_df = sample2individual_df.loc[list(set(sample2individual_df.index) & set(environment_df.index)),:] - diff = orgSize - sample2individual_df.shape[0] - orgSize = sample2individual_df.shape[0] - print("Dropped: " + str(diff) + " samples because they are not present in the environment file.") - - ### - print("Number of samples with genotype & phenotype data: " + str(sample2individual_df.shape[0])) - - - ##Filter now the actual data! - #Filter phenotype data based on the linking files. - phenotype_df = phenotype_df.loc[list(set(phenotype_df.index) & set(annotation_df.index)), sample2individual_df.index.values] - - #Filter kinship data based on the linking files. 
- genetically_unique_individuals = None - if kinship_df is not None: - kinship_df = kinship_df.loc[np.intersect1d(kinship_df.index, sample2individual_df['iid']), np.intersect1d(kinship_df.index, sample2individual_df['iid'])] - genetically_unique_individuals = utils.get_unique_genetic_samples(kinship_df, relatedness_score); - - #Filter covariate data based on the linking files. - if covariate_df is not None: - covariate_df = covariate_df.loc[np.intersect1d(covariate_df.index, sample2individual_df.index.values),:] - - snp_feature_filter_df = qtl_loader_utils.get_snp_feature_df(snp_feature_filename) - try: - feature_filter_df = qtl_loader_utils.get_snp_df(feature_filename) - except: - if feature_filename is not None: - feature_filter_df = pd.DataFrame(index = feature_filename) - #Do filtering on features. - if feature_filter_df is not None: - phenotype_df = phenotype_df.loc[feature_filter_df.index,:] - ##Filtering on features to test. - if snp_feature_filter_df is not None: - phenotype_df = phenotype_df.loc[np.unique(snp_feature_filter_df['feature']),:] - ##Filtering on features to test from the combined feature snp filter. - - if ((not cis_mode) and len(set(bim['chrom'])) < 22) : - print("Warning, running a trans-analysis on snp data from less than 22 chromosomes.\nTo merge data later the permutation P-values need to be written out.") - - if(cis_mode): - #Remove features from the annotation that are on chromosomes which are not present anyway. - annotation_df = annotation_df[np.in1d(annotation_df['chromosome'], list(set(bim['chrom'])))] - - #Prepare to filter on snps. - snp_filter_df = qtl_loader_utils.get_snp_df(snps_filename) - if snp_filter_df is not None: - toSelect = set(snp_filter_df.index).intersection(set(bim['snp'])) - bim = bim.loc[bim['snp'].isin(toSelect)] - ##Filtering on SNPs to test from the snp filter. 
- - if snp_feature_filter_df is not None: - toSelect = set(np.unique(snp_feature_filter_df['snp_id'])).intersection(set(bim['snp'])) - bim = bim.loc[bim['snp'].isin(toSelect)] - ##Filtering on features to test from the combined feature snp filter. - - - #Determine features to be tested - if chromosome=='all': - feature_list = list(set(annotation_df.index) & set(phenotype_df.index)) - else: - if not selectionStart is None : - lowest = min([selectionStart,selectionEnd]) - highest = max([selectionStart,selectionEnd]) - annotation_df['mean'] = ((annotation_df["start"] + annotation_df["end"])/2) - feature_list = list(set(annotation_df.iloc[(annotation_df['chromosome'].values == chromosome) & (annotation_df['mean'].values >= lowest) & (annotation_df["mean"].values < highest)].index.values) & set(phenotype_df.index)) - del annotation_df['mean'] - else : - feature_list = list(set(annotation_df[annotation_df['chromosome'] == chromosome].index) & set(phenotype_df.index)) - - print("Number of features to be tested: " + str(len(feature_list))) - print("Total number of variants to be considered, before variante QC and feature intersection: " + str(bim.shape[0])) - - - feature_variant_covariate_df = qtl_loader_utils.get_snp_feature_df(feature_variant_covariate_filename) - return [phenotype_df, kinship_df, covariate_df, environment_df, sample2individual_df, complete_annotation_df, annotation_df, snp_filter_df, snp_feature_filter_df, genetically_unique_individuals, minimum_test_samples, feature_list,bim,fam,bed, chromosome, selectionStart, selectionEnd, feature_variant_covariate_df] - - -import struct_lmm2 as StructLMM2 - -def run_structLMM2_int(pheno_filename, anno_filename, env_filename, geno_prefix, output_dir, plinkGenotype = True, - window_size = 250000, min_maf = 0.05, min_hwe_P = 0.001, min_call_rate = 0.95, - interaction_mode = True, cis_mode = True, gaussianize_method = None, - seed = np.random.randint(40000), relatedness_score = 0.95, - feature_variant_covariate_filename 
= None, snps_filename = None, feature_filename = None, - snp_feature_filename = None, genetic_range = 'all', covariates_filename = None, - kinship_filename = None, sample_mapping_filename = None): - fill_NaN = Imputer(missing_values = np.nan, strategy = 'mean', axis = 0) - - [phenotype_df, kinship_df, covariate_df, environment_df, sample2individual_df, annotation_df, snp_filter_df, - snp_feature_filter_df, feature_list, bim, fam, bed, - chromosome, selectionStart, selectionEnd, feature_variant_covariate_df]=\ - run_structLMM2_int_load_intersect_phenotype_environments_covariates_kinship_sample_mapping(pheno_filename = pheno_filename, - anno_filename = anno_filename, env_filename = env_filename, geno_prefix = geno_prefix, - plinkGenotype = plinkGenotype, cis_mode = cis_mode, interaction_mode = interaction_mode, - relatedness_score = relatedness_score, snps_filename = snps_filename, - feature_filename = feature_filename, snp_feature_filename = snp_feature_filename, - selection = genetic_range, covariates_filename = covariates_filename, - kinship_filename = kinship_filename, sample_mapping_filename = sample_mapping_filename, - feature_variant_covariate_filename = feature_variant_covariate_filename) - - #import pdb; pdb.set_trace() - if(feature_list == None or len(feature_list) == 0): - print ('No features to be tested.') - sys.exit() - - #Arrays to store number of samples and genetic effects - n_samples = [] - n_e_samples = [] - currentFeatureNumber = 0 - pvs = list() - - - for feature_id in tqdm(feature_list): - currentFeatureNumber+= 1 - - snp_list = snp_feature_filter_df['snp_id'].loc[snp_feature_filter_df['feature'] == feature_id] - snpQuery = bim.query('snp in @snp_list') - - if len(snpQuery) != 0: - phenotype_ds = phenotype_df.loc[feature_id] - contains_missing_samples = any(~np.isfinite(phenotype_ds)) - - # import pdb; pdb.set_trace() - - if(contains_missing_samples): - print ('Feature: ' + feature_id + ' contains missing data.') - phenotype_ds.dropna(inplace = 
True) - - '''select indices for relevant individuals in genotype matrix - These are not unique. NOT to be used to access phenotype/covariates data - ''' - individual_ids = sample2individual_df.loc[phenotype_ds.index, 'iid'].values - sample2individual_feature = sample2individual_df.loc[phenotype_ds.index] - - if(contains_missing_samples): - tmp_unique_individuals = genetically_unique_individuals - genetically_unique_individuals = utils.get_unique_genetic_samples(kinship_df.loc[individual_ids, individual_ids], relatedness_score); - - if phenotype_ds.empty or (len(genetically_unique_individuals) < minimum_test_samples) : - print("Feature: " + feature_id + " not tested: not enough samples do QTL test.") - # fail_qc_features.append(feature_id) - if contains_missing_samples: - genetically_unique_individuals = tmp_unique_individuals - continue - elif np.var(phenotype_ds.values) == 0: - print("Feature: " + feature_id + " has no variance in selected individuals.") - # fail_qc_features.append(feature_id) - if contains_missing_samples: - genetically_unique_individuals = tmp_unique_individuals - continue - - - #import pdb; pdb.set_trace() - n_sample_t = (phenotype_ds.size) - n_e_samples_t = (len(genetically_unique_individuals)) - print ('For feature: ' + str(currentFeatureNumber) + '/' +str(len(feature_list)) + ' (' + feature_id + '): ' + str(snpQuery.shape[0]) + ' SNPs need to be tested.\n Please stand by.') - - # import pdb; pdb.set_trace() - - - for snpGroup in utils.chunker(snpQuery, blocksize): - snp_idxs = snpGroup['i'].values - snp_names = snpGroup['snp'].values - - - #subset genotype matrix, we cannot subselect at the same time, do in two steps. - snp_df = pd.DataFrame(data = bed[snp_idxs,:].compute().transpose(), index = fam.index, columns = snp_names) - snp_df = snp_df.loc[individual_ids,:] - - - #We could make use of relatedness when imputing. 
- snp_matrix_DF = pd.DataFrame(fill_NaN.fit_transform(snp_df), index = snp_df.index, columns = snp_df.columns) - snp_df = None - -# test if the environments, covariates, kinship, snp and phenotype are in the same order - if ((all(snp_matrix_DF.index == kinship_df.loc[individual_ids,individual_ids].index) if kinship_df is not None else True) &\ - (all(phenotype_ds.index == covariate_df.loc[sample2individual_feature['sample'],:].index)if covariate_df is not None else True)&\ - all(phenotype_ds.index == environment_df.loc[sample2individual_feature['sample'],:].index)&\ - all(snp_matrix_DF.index == sample2individual_feature.loc[phenotype_ds.index]['iid'])): - ''' - if all lines are in order put in arrays the correct genotype and phenotype - x=a if cond1 else b <---> equivalent to if cond1: x=a else x=b; - better readability of the code - ''' - kinship_mat = kinship_df.loc[individual_ids,individual_ids].values if kinship_df is not None else None - env = environment_df.loc[sample2individual_feature['sample'],:].values - if covariate_df is not None: - cov_matrix = covariate_df.loc[sample2individual_feature['sample'],:].values - else: - cov_matrix = None - - phenotype = utils.force_normal_distribution(phenotype_ds.values, method = gaussianize_method) if gaussianize_method is not None else phenotype_ds.values - else: - print ('There is an issue in mapping phenotypes and genotypes') - sys.exit() - - - n_indep_snps = sum(limix.qc.indep_pairwise(snp_matrix_DF.values, window_size, window_size, threshold, verbose = True)) - print(n_indep_snps) - # loop over SNPs to estimate individual effect sizes - for snp in snp_names: - g = snp_matrix_DF.loc[:,snp].values - g = g.T.reshape(g.shape[0],1) - y = phenotype - y = y.T.reshape(y.shape[0],1) - W = cov_matrix - X = np.concatenate((W, g[:, np.newaxis]), axis = 1) - E = env - # fit null model - slmm_int = StructLMM2(y, W, E) - null = slmm_int.fit_null(g) - _p = slmm_int.score_2_dof(g) - # import pdb; pdb.set_trace() - pvs.append([snp, 
feature_id, _p, _p*n_indep_snps]) - - - pvs_df = pd.DataFrame(pvs, columns = ["snp_id","feature_id","p_value","feature_corrected_p_value"]) - - if not selectionStart is None : - # add if statements - pvs_df.to_csv(output_dir + '/eqtl_gxe_pvalue_{}_{}_{}.txt'.format(chromosome, selectionStart, selectionEnd), sep = '\t', index = True) - else : - # add if statements - pvs_df.to_csv(output_dir + '/eqtl_gxe_pvalue_{}.txt'.format(chromosome), sep = '\t', index = True) - - - -if __name__=='__main__': - - #Variables - chunkFile = '/nfs/leia/research/stegle/mjbonder/ChunkFiles/Ensembl_75_Limix_Annotation_FC_Gene_step10.txt' - genotypeFile = '/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed' - annotationFile = '/hps/nobackup/hipsci/scratch/singlecell_endodiff/data_processed/scQTLs/annos/combined_feature_id_annos.tsv' - phenotypeFile = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/exploratory_analysis/input_files/full_sc_exprs_pheno.tsv' - covariateFile = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/exploratory_analysis/input_files/full_sc_exprs_10pcs_covs.tsv' - environmentFile = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/exploratory_analysis/input_files/full_sc_exprs_10pcs_envs.tsv' - kinshipFile = '/hps/nobackup/hipsci/scratch/genotypes/imputed/2017-03-27/Full_Filtered_SNPs_Plink-F/hipsci.wec.gtarray.HumanCoreExome.imputed_phased.20170327.genotypes.norm.renamed.kinship' - sampleMappingFile = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/exploratory_analysis/input_files/full_sc_exprs_samples.tsv' - featureVariantFilter = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/exploratory_analysis/input_files/full_sc_exprs_merged_leads_top10_perstage.tsv' - blockSize = '500' - output_dir = '/nfs/leia/research/stegle/acuomo/singlecell_endodiff/data/pipeline_snakemakes/output_structlmm_alldays_10PCs/' - - 
run_structLMM2_int(phenotypeFile, annotationFile, environmentFile, genotypeFile, - output_dir, sample_mapping_filename = sampleMappingFile, snp_feature_filename = featureVariantFilter, - kinship_filename = kinshipFile, covariates_filename = covariateFile) - - - diff --git a/setup.cfg b/setup.cfg index 461c097..ebfda63 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,55 +6,52 @@ classifiers = License :: OSI Approved :: MIT License Operating System :: OS Independent Programming Language :: Python -description = StructLMM that accounts for population structure. -download_url = https://github.com/limix/struct-lmm2 -keywords = function, optimisation +description = A linear mixed model framework to map multivariate context-specific eQTL using single-cell RNA-seq data. +download_url = https://github.com/limix/CellRegMap +keywords = scRNA-seq, eQTL, context-specific regulation license = MIT long_description = file: README.md long_description_content_type = text/markdown maintainer = Danilo Horta platforms = Windows, MacOS, Linux maintainer_email = horta@ebi.ac.uk -name = struct-lmm2 -url = https://github.com/limix/struct-lmm2 +name = cellregmap +url = https://github.com/limix/CellRegMap version = attr: version.get [options] zip_safe = True include_package_data = True packages = find: -setup_requires = - pytest-runner>=4.2 +python_requires = >= 3.6 install_requires = - brent-search>=1.0.33 - cachetools>=2.1.0 - liknorm>=1.2.1 - numpy-sugar>=1.4.1 - numpy>=1.14.3 - pandas>=0.23.1 - pytest>=3.6.3 - pytest-doctestplus>=0.3.0 - scipy>=1.0.1 - chiscore>=0.2.0 - struct-lmm>=0.3.1 + chiscore>=0.2.3 + glimix-core>=3.1.12 + numpy-sugar>=1.5.1 + numpy>=1.17.5 + pytest>=5.4.3 + scipy>=1.2.3 + tqdm>=4.41.1 [aliases] test = pytest [tool:pytest] addopts = - --doctest-plus --doctest-modules --doctest-glob='*.rst' --ignore="setup.py" - --ignore="doc/conf.py" doctest_plus = enabled -doctest_optionflags = NORMALIZE_WHITESPACE 
IGNORE_EXCEPTION_DETAIL ELLIPSIS ALLOW_UNICODE FLOAT_CMP -doctest_plus_atol = 1e-03 -doctest_plus_rtol = 1e-03 +doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS ALLOW_UNICODE doctest_rst = enabled -norecursedirs = .eggs .git *.egg-info build .ropeproject .undodir -pep8ignore = E402 W0212 W0622 R0915 +norecursedirs = .eggs .git *.egg-info build .ropeproject .undodir old_files + +[flake8] +ignore = E501,E741,E203,W503,W0212,W0622,R0915,E743 + +[pycodestyle] +ignore = E741,E743,E203 +max-line-length = 88 [tool:isort] multi_line_output=3 @@ -66,10 +63,6 @@ line_length=88 [pylint] disable = redefined-builtin,R0915 -[flake8] -ignore = E501 E741 E203 W503 W0212 W0622 R0915 -max-line-length = 88 - [rstcheck] ignore_substitutions = today, version ignore_directives = plot, autofunction, command-output, autmodule, automodule, autoclass, autoattribute, automethod, doctest diff --git a/setup.py b/setup.py index f4ebca1..2553605 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,6 @@ -from setuptools import setup +#!/usr/bin/env python +from setuptools import setup -setup( - name="struct-lmm2", - maintainer="Anna", - description="Nice package", - author="Anna", - license="BSD", - python_requires=">=3.6", - install_requires=["numpy>=1.6", "scipy", "glimix_core", "numpy-sugar", "chiscore"], -) +if __name__ == "__main__": + setup() diff --git a/simulations1.py b/simulations1.py deleted file mode 100644 index d8734d3..0000000 --- a/simulations1.py +++ /dev/null @@ -1,506 +0,0 @@ -import sys - -import numpy as np -from numpy import ( - asarray, - concatenate, - empty, - eye, - inf, - newaxis, - ones, - sqrt, - stack, - trace, - where, - zeros, -) -from numpy.linalg import eigvalsh, inv, solve -from numpy.random import RandomState -from numpy_sugar.linalg import ddot, economic_qs, economic_svd -from scipy.linalg import sqrtm - -from chiscore import davies_pvalue, optimal_davies_pvalue -from glimix_core.lmm import LMM -from struct_lmm import StructLMM - - -""" 
sample phenotype from the model: - - 𝐲 = W𝛂 + 𝐠𝛽 + 𝐠⊙𝛃 + 𝐞 + u + 𝛆 - - 𝛃 ∼ 𝓝(𝟎, b²Σ) - 𝐞 ∼ 𝓝(𝟎, e²Σ) - 𝛆 ∼ 𝓝(𝟎, 𝜀²I) - Σ = EEᵀ - u ~ 𝓝(𝟎, g²K) - -""" - - -def _mod_liu(q, w): - from chiscore import liu_sf - - (pv, dof_x, _, info) = liu_sf(q, w, [1] * len(w), [0] * len(w), True) - return (pv, info["mu_q"], info["sigma_q"], dof_x) - - -def _qmin(pliumod): - from numpy import zeros - import scipy.stats as st - - # T statistic - T = pliumod[:, 0].min() - - qmin = zeros(pliumod.shape[0]) - percentile = 1 - T - for i in range(pliumod.shape[0]): - q = st.chi2.ppf(percentile, pliumod[i, 3]) - mu_q = pliumod[i, 1] - sigma_q = pliumod[i, 2] - dof = pliumod[i, 3] - qmin[i] = (q - dof) / (2 * dof) ** 0.5 * sigma_q + mu_q - - return qmin - - -# Let Σ = 𝙴𝙴ᵀ -# 𝐲 ∼ 𝓝(𝙼𝛂, 𝓋₀𝙳(ρ𝟏𝟏ᵀ + (1-ρ)Σ)𝙳 + 𝓋₁(aΣ + (1-a)𝙺) + 𝓋₂𝙸). - -seed = int(sys.argv[1]) -random = RandomState(seed) # set a seed to replicate simulations -# set sample size -n_samples = 500 -# simulate MAF (minor allele frequency) distribution -maf_min = 0.05 -maf_max = 0.45 -n_snps = 20 - -print(n_samples, "samples") -print(n_snps, "snps") -print(maf_min, "min MAF") -print(maf_max, "max MAF") - -"simulate environments" - -# two groups -group_size = n_samples // 2 - -E = zeros((n_samples, 2)) - -E[:group_size, 0] = 1 -E[group_size:, 1] = 1 - -Sigma = E @ E.T - -# import pdb; pdb.set_trace() - -"simulate genotypes (for n_snps variants)" - - -# Simulate genotypes (for n_snps variants) -mafs = random.rand(n_snps) * (maf_max - maf_min) + maf_min - -# simulate SNPs accordingly -G = [] - -for maf in mafs: - g = random.choice( - [0, 1, 2], - p=[(1 - maf) ** 2, 1 - ((1 - maf) ** 2 + maf ** 2), maf ** 2], - size=n_samples, - ) - G.append(asarray(g, float)) - -# We normalize it such that the expectation of 𝔼[𝐠ᵀ𝐠] = 1. -# i.e. 
normalize columns - -G = stack(G, axis=1) -G -= G.mean(0) -G /= G.std(0) - -G0 = G.copy() -G0 /= sqrt(G0.shape[1]) -K = G0 @ G0.T - - -"simulate two SNPs to have persistent effects and two to have interaction effects" -"one SNP in common, one unique to each category" - -idxs_persistent = [5, 6] -idxs_gxe = [10, 11] - -print("MAFs of causal SNPs") - -print("{}\t{}".format(idxs_persistent[0], mafs[idxs_persistent[0]])) -print("{}\t{}".format(idxs_persistent[1], mafs[idxs_persistent[1]])) -print("{}\t{}".format(idxs_gxe[0], mafs[idxs_gxe[0]])) -print("{}\t{}".format(idxs_gxe[1], mafs[idxs_gxe[1]])) - -# idxs_persistent = [5, 30] -# idxs_gxe = [30, 45] - -# Variances -# -# 𝐲 ∼ 𝓝(1 + ∑ᵢ𝐠ᵢ𝛽_gᵢ, σ²_g⋅𝙳𝟏𝟏ᵀ𝙳 + σ²_gxe⋅𝙳Σ𝙳𝙳 + σ²_e⋅Σ + σ²_k⋅𝙺 + σ²_n⋅𝙸. -# σ²_g + σ²_gxe + σ²_e + σ²_k + σ²_n = 1 -# σ²₁*ρ + σ²₁*(1-ρ) + σ²_e + σ²_k + σ²_n = 1 - -# The user will provide: σ²₁, ρ -# And we assume that σ²_e = σ²_k = σ²_n = v -# v = (1 - σ²₁*ρ + σ²₁*(1-ρ)) / 3 -# σ²_e = a*σ²₂ -# σ²_k = (1-a)*σ²₂ - - -"simulate sigma parameters" - -rho = 0.8 # contribution of interactions (proportion) -var_tot_g_gxe = 0.4 - -print(rho, "rho (prop var explained by GxE)") -print(var_tot_g_gxe, "tot variance G + GxE") - -var_tot_g = (1 - rho) * var_tot_g_gxe -var_tot_gxe = rho * var_tot_g_gxe - -var_g = var_tot_g / len(idxs_persistent) # split effect across n signals -var_gxe = var_tot_gxe / len(idxs_gxe) - -v = (1 - var_tot_gxe - var_tot_g) / 3 -var_e = v # environment effect only -var_k = v # population structure effect ? -var_noise = v -# print(v) - -""" (persistent) genotype portion of phenotype: - - 𝐲_g = G 𝛃_g - - 𝐲_g = ∑ᵢ𝐠ᵢ𝛽_gᵢ, - - where 𝐠ᵢ is the i-th column of 𝙶. 
- -""" - - -# simulate (persistent) beta to have causal SNPs as defined -beta_g = zeros(n_snps) -beta_g[idxs_persistent] = random.choice([+1, -1], size=len(idxs_persistent)) -beta_g /= beta_g.std() -beta_g *= var_g - -"calculate genoytpe component of y" - -y_g = G @ beta_g - - -""" GxE portion of phenotype: - - 𝐲_gxe = ∑ᵢ gᵢ x 𝛃ᵢ - -""" -# simulate (GxE) variance component to have causal SNPs as defined -sigma_gxe = zeros(n_snps) -sigma_gxe[idxs_gxe] = var_gxe - -# for i in range(n_snps): -# print('{}\t{}'.format(i,sigma_gxe[i])) - -y_gxe = zeros(n_samples) -u_gxe = ones(n_samples) -u_gxe[group_size:] = -1 - -for i in range(n_snps): - # beta_gxe = random.multivariate_normal(zeros(n_samples), sigma_gxe[i] * Sigma) - beta_gxe = sigma_gxe[i] * u_gxe - y_gxe += G[:, i] * beta_gxe - - -e = random.multivariate_normal(zeros(n_samples), v * Sigma) -u = random.multivariate_normal(zeros(n_samples), v * K) -eps = random.multivariate_normal(zeros(n_samples), v * eye(n_samples)) - -e0 = random.multivariate_normal(zeros(n_samples), v * 3 / 2 * Sigma) -eps0 = random.multivariate_normal(zeros(n_samples), v * 3 / 2 * eye(n_samples)) - - -"sum all parts of y" -y = 1 + y_g + y_gxe + e + u + eps -y0 = 1 + y_g + y_gxe + e0 + eps0 - - -p_values0 = [] -p_values1 = [] -p_values2 = [] -p_values3 = [] - -print("testing using standard structLMM") - -"test using struct LMM (standard)" - - -# y = y.reshape(y.shape[0], 1) - -# "Association test" - -print( - "p-values of association test SNPs", - idxs_persistent, - idxs_gxe, - "should be causal (persistent + GxE)", -) -slmm = StructLMM(y0, M=np.ones(n_samples), E=E, W=E) -slmm.fit(verbose=False) - -for i in range(n_snps): - g = G[:, i] - g = g.reshape(g.shape[0], 1) - _p = slmm.score_2dof_assoc(g) - print("{}\t{}".format(i, _p)) - p_values0.append(_p) - -## "Interaction test" - -print("p-values of interaction test SNPs", idxs_gxe, "should be causal (GxE)") - -for i in range(n_snps): - g = G[:, i] - # g = g.reshape(g.shape[0],1) - M = 
np.ones(n_samples) - M = np.stack([M, g], axis=1) - slmm_int = StructLMM(y0, M=M, E=E, W=E) - slmm_int.fit(verbose=False) - _p = slmm_int.score_2dof_inter(g) - print("{}\t{}".format(i, _p)) - p_values1.append(_p) - - -################################################# -################################################# -################################################# -################################################# - -# print("using structLMM 2 now") - -# "test using struct LMM 2 (in this case it should not be very different)" - -## y = y.reshape(y.shape[0], 1) - -Cov = {} -QS_a = {} -M = ones((n_samples, 1)) - -# a_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] -a_values = [1] - -for a in a_values: - Cov[a] = a * Sigma + (1 - a) * K - QS_a[a] = economic_qs(Cov[a]) - -# "Association test" - -print( - "p-values of association test SNPs", - idxs_persistent, - idxs_gxe, - "should be causal (persistent + GxE)", -) - -rhos = [0.0, 0.1 ** 2, 0.2 ** 2, 0.3 ** 2, 0.4 ** 2, 0.5 ** 2, 0.5, 0.999] - -for i in range(n_snps): - # print(i) - g = G[:, i] - g = g.reshape(g.shape[0], 1) - best = {"lml": -inf, "a": 0, "v0": 0, "v1": 0, "beta": 0} - for a in a_values: - M = np.ones(n_samples) - lmm = LMM(y0, M, QS_a[a], restricted=True) # cov(y) = v0*(aΣ + (1-a)K) + v1*Is - lmm.fit(verbose=False) - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["a"] = a - best["v0"] = lmm.v0 - best["v1"] = lmm.v1 - best["alpha"] = lmm.beta - - "H0 optimal parameters" - alpha = lmm.beta[:-1] - beta = lmm.beta[-1] - # e²Σ + g²K = s²(aΣ + (1-a)K) - # e² = s²*a - # g² = s²*(1-a) - s2 = lmm.v0 # s² - eps2 = lmm.v1 # 𝜀² - - "H1 via score test" - # Let K₀ = g²K + e²Σ + 𝜀²I - # with optimal values e² and 𝜀² found above. - K0 = lmm.covariance() - X = concatenate((E, g), axis=1) - X = M.reshape((-1, 1)) - - # import pdb; pdb.set_trace() - # Let P₀ = K⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. 
- K0iX = solve(K0, X) - P0 = inv(K0) - K0iX @ solve(X.T @ K0iX, K0iX.T) - - # P₀𝐲 = K⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. - K0iy = solve(K0, y0) - P0y = K0iy - solve(K0, X @ solve(X.T @ K0iX, X.T @ K0iy)) - # self._P(self._y).min() - # -4.90270502291706 - - # import pdb; pdb.set_trace() - # The covariance matrix of H1 is K = K₀ + b²diag(𝐠)⋅Σ⋅diag(𝐠) - # We have ∂K/∂b² = diag(𝐠)⋅Σ⋅diag(𝐠) - # The score test statistics is given by - # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 - - dK_G = ddot(g.ravel(), ddot(ones((n_samples, n_samples)), g.ravel())) - dK_GxE = ddot(g.ravel(), ddot(Sigma, g.ravel())) - # P0 = P0 + 1e-9 * eye(P0.shape[0]) - # sqrP0 = sqrtm(P0) - # import pdb; pdb.set_trace() - Q_G = P0y.T @ dK_G @ P0y - Q_GxE = P0y.T @ dK_GxE @ P0y - - # the eigenvalues of ½P₀⋅∂K⋅P₀ - # are tge eigenvalues of - gPg = g.T @ P0 @ g - goE = g * E - gPgoE = g.T @ P0 @ goE - gEPgE = goE.T @ P0 @ goE - - nE = E.shape[1] + 1 - F = empty((nE, nE)) - - # lambdas = zeros(len(rhos)) - lambdas = [] - Q = [] - for rho in rhos: - # print(ii) - # print(rho) - Q.append((rho * Q_G + (1 - rho) * Q_GxE) / 2) - # dK = rho * dK_GxE + (1 - rho) * dK_G - # lambdas.append(eigvalsh((sqrP0 @ dK @ sqrP0) / 2)) - # lambdas[ii] = eigvalsh(sqrP0 @ dK @ sqrP0) / 2 - F[0, 0] = rho * gPg - F[0, 1:] = sqrt(rho) * sqrt(1 - rho) * gPgoE - F[1:, 0] = F[0, 1:] - F[1:, 1:] = (1 - rho) * gEPgE - lambdas.append(eigvalsh(F) / 2) - - pliumod = stack([_mod_liu(Qi, lam) for Qi, lam in zip(Q, lambdas)], axis=0) - qmin = _qmin(pliumod) - - # 3. 
Calculate quantites that occur in null distribution - Px1 = P0 @ g - m = 0.5 * (g.T @ Px1) - goE = g * E - PgoE = P0 @ goE - ETxPxE = 0.5 * (goE.T @ PgoE) - ETxPx1 = goE.T @ Px1 - ETxPx11xPxE = 0.25 / m * (ETxPx1 @ ETxPx1.T) - ZTIminusMZ = ETxPxE - ETxPx11xPxE - eigh = eigvalsh(ZTIminusMZ) - - eta = ETxPx11xPxE @ ZTIminusMZ - vareta = 4 * trace(eta) - - OneZTZE = 0.5 * (g.T @ PgoE) - tau_top = OneZTZE @ OneZTZE.T - tau_rho = empty(len(rhos)) - - for ii in range(len(rhos)): - # print(ii) - # tau_rho[ii] = rhos[ii] * m + (1 - rhos[ii]) / m * tau_top - tau_rho[ii] = (1 - rhos[ii]) * m + (rhos[ii]) / m * tau_top - - MuQ = sum(eigh) - VarQ = sum(eigh ** 2) * 2 + vareta - KerQ = sum(eigh ** 4) / (sum(eigh ** 2) ** 2) * 12 - Df = 12 / KerQ - - # 4. Integration - T = pliumod[:, 0].min() - pvalue = optimal_davies_pvalue( - qmin, MuQ, VarQ, KerQ, eigh, vareta, Df, tau_rho, rhos, T - ) - - # Final correction to make sure that the p-value returned is sensible - multi = 3 - if len(rhos) < 3: - multi = 2 - idx = where(pliumod[:, 0] > 0)[0] - pval = pliumod[:, 0].min() * multi - if pvalue <= 0 or len(idx) < len(rhos): - pvalue = pval - if pvalue == 0: - if len(idx) > 0: - pvalue = pliumod[:, 0][idx].min() - - print("{}\t{}".format(i, pvalue)) - p_values2.append(pvalue) - # return pvalue - -# sys.exit(0) -"Interaction test" - - -print("p-values of interaction test SNPs", idxs_gxe, "should be causal (GxE)") - -for i in range(n_snps): - g = G[:, i] - g = g.reshape(g.shape[0], 1) - M = np.ones((n_samples, 1)) - Mg = concatenate((M, g), axis=1) - best = {"lml": -inf, "a": 0, "v0": 0, "v1": 0, "beta": 0} - for a in a_values: - lmm = LMM(y0, Mg, QS_a[a], restricted=True) # cov(y) = v0*(aΣ + (1-a)K) + v1*Is - lmm.fit(verbose=False) - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["a"] = a - best["v0"] = lmm.v0 - best["v1"] = lmm.v1 - best["alpha"] = lmm.beta - - "H0 optimal parameters" - alpha = lmm.beta[:-1] - beta = lmm.beta[-1] - # e²Σ + g²K = s²(aΣ + (1-a)K) - # e² = 
s²*a - # g² = s²*(1-a) - s2 = lmm.v0 # s² - eps2 = lmm.v1 # 𝜀² - - "H1 via score test" - # Let K₀ = g²K + e²Σ + 𝜀²I - # with optimal values e² and 𝜀² found above. - K0 = lmm.covariance() - X = concatenate((E, g), axis=1) - - # Let P₀ = K⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. - K0iX = solve(K0, X) - P0 = inv(K0) - K0iX @ solve(X.T @ K0iX, K0iX.T) - - # P₀𝐲 = K⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. - K0iy = solve(K0, y0) - P0y = K0iy - solve(K0, X @ solve(X.T @ K0iX, X.T @ K0iy)) - - # The covariance matrix of H1 is K = K₀ + b²diag(𝐠)⋅Σ⋅diag(𝐠) - # We have ∂K/∂b² = diag(𝐠)⋅Σ⋅diag(𝐠) - # The score test statistics is given by - # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 - dK = ddot(g.ravel(), ddot(Sigma, g.ravel())) - Q = (P0y.T @ dK @ P0y) / 2 - - # Q is the score statistic for our interaction test and follows a linear combination - # of chi-squared (df=1) distributions: - # Q ∼ ∑λχ², where λᵢ are the non-zero eigenvalues of ½√P₀⋅∂K⋅√P₀. - sqrP0 = sqrtm(P0) - pval = davies_pvalue(Q, (sqrP0 @ dK @ sqrP0) / 2) - print("{}\t{}".format(i, pval)) - p_values3.append(pval) diff --git a/struct_lmm2/__init__.py b/struct_lmm2/__init__.py deleted file mode 100644 index 5a07dae..0000000 --- a/struct_lmm2/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .struct_lmm2 import StructLMM2 - -__version__ = "0.0.1" diff --git a/struct_lmm2/struct_lmm2.py b/struct_lmm2/struct_lmm2.py deleted file mode 100644 index aecd737..0000000 --- a/struct_lmm2/struct_lmm2.py +++ /dev/null @@ -1,178 +0,0 @@ -import numpy as np -from numpy import concatenate, inf, newaxis -from numpy.linalg import eigvalsh, inv, solve -from numpy.random import RandomState -from numpy_sugar.linalg import ddot, economic_qs, economic_svd -from scipy.linalg import sqrtm - -from chiscore import davies_pvalue, optimal_davies_pvalue -from glimix_core.lmm import LMM - - -class StructLMM2: - r""" - Mixed-model with genetic effect heterogeneity. 
- - The extended StructLMM model (two random effects) is - - 𝐲 = 𝙼𝛂 + 𝐠𝛽 + 𝐠⊙𝛃 + 𝐞 + 𝐮 + 𝛆, (1) - - where - - (𝐠⊙𝛃)ᵢ = 𝐠ᵢ𝛃ᵢ - 𝛽 ∼ 𝓝(0, 𝓋₀ρ₀), - 𝛃 ∼ 𝓝(𝟎, 𝓋₀(1-ρ₀)𝙴𝙴ᵀ), - 𝐞 ∼ 𝓝(𝟎, 𝓋₁ρ₁𝚆𝚆ᵀ), - 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺), and - 𝛆 ∼ 𝓝(𝟎, 𝓋₂𝙸). - - The matrices 𝙴 and 𝚆 are generally the same, and represent the environment - configuration for each sample. - The parameter ρ ∈ [𝟶, 𝟷] dictates the relevance of genotype-environment interaction - versus the genotype effect alone. - The term 𝐞 accounts for additive environment-only effects while 𝛆 accounts for - noise effects. - The term 𝐮 accounts for population structure. - - The above model is equivalent to - - 𝐲 = 𝙼𝛂 + 𝐠⊙𝛃 + 𝐞 + 𝐮 + 𝛆, (2) - - where - - 𝛃 ∼ 𝓝(𝟎, 𝓋₀(ρ₀𝟏𝟏ᵀ + (1-ρ₀)𝙴𝙴ᵀ)), - 𝐞 ∼ 𝓝(𝟎, 𝓋₁ρ₁𝚆𝚆ᵀ), - 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺), and - 𝛆 ∼ 𝓝(𝟎, 𝓋₂𝙸). - - Notice that the 𝛃 in Eqs. (1) and (2) are not the same. - Its marginalised form is given by - - 𝐲 ∼ 𝓝(𝙼𝛂, 𝓋₀𝙳(ρ₀𝟏𝟏ᵀ + (1-ρ₀)𝙴𝙴ᵀ)𝙳 + 𝓋₁(ρ₁𝚆𝚆ᵀ + (1-ρ₁)𝙺) + 𝓋₂𝙸), - - where 𝙳 = diag(𝐠). - - StructLMM method is used to perform two types of statistical tests. - The association one compares the following hypotheses: - - 𝓗₀: 𝓋₀ = 0 - 𝓗₁: 𝓋₀ > 0 - - 𝓗₀ denotes no genetic association, while 𝓗₁ models any genetic association. - In particular, 𝓗₁ includes genotype-environment interaction as part of genetic - association. - The interaction test is slightly more complicated as the term 𝐠𝛽 in Eq. (1) is now - considered a fixed one. - In pratice, however, we instead include 𝐠 in the covariates matrix 𝙼 and set ρ = 0 - in Eq. (2). - We refer to this modified model as the interaction model. - The compared hypotheses are: - - 𝓗₀: 𝓋₀ = 0 (given the interaction model) - 𝓗₁: 𝓋₀ > 0 (given the interaction model) - - Implementation - -------------- - - Let Σ₀ = 𝙴𝙴ᵀ and Σ₁ = 𝚆𝚆ᵀ. - Therefore, - - 𝐲 ∼ 𝓝(𝙼𝛂, 𝓋₀𝙳(ρ₀𝟏𝟏ᵀ + (1-ρ₀)Σ₀)𝙳 + 𝓋₁(ρ₁Σ₁ᵀ + (1-ρ₁)𝙺) + 𝓋₂𝙸). 
- - Let C(ρ₁) = - """ - - def __init__(self, y, M, E, W=None, K=None): - - # The above model is equivalent to - - # 𝐲 = 𝙼𝛂 + 𝐠⊙𝛃 + 𝐞 + 𝐮 + 𝛆, (2) - - # where - - # 𝛃 ∼ 𝓝(𝟎, 𝓋₀(ρ𝟏𝟏ᵀ + (1-ρ)𝙴𝙴ᵀ)), - # 𝐞 ∼ 𝓝(𝟎, 𝓋₁𝚆𝚆ᵀ), - # 𝐮 ~ 𝓝(𝟎, g²𝙺), and - # 𝛆 ∼ 𝓝(𝟎, 𝓋₂𝙸). - if W is None: - W = E - - self._y = y - self._M = M - self._E = E - self._W = W - self._K = K - self._Sigma0 = E @ E.T - self._Sigma1 = W @ W.T - - self._rho0 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] - self._rho1 = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] - - self.Cov = {} - self.QS_a = {} - for a in self.a_values: - self.Cov[a] = a * self.Sigma + (1 - a) * self.K - self.QS_a[a] = economic_qs(self.Cov[a]) - - def fit_null(self, g): - self.X = concatenate((self.W, g[:, newaxis]), axis=1) - best = {"lml": -inf, "a": 0, "v0": 0, "v1": 0, "beta": 0} - for a in self.a_values: - # cov(y) = v0*(aΣ + (1-a)K) + v1*I - lmm = LMM(self.y, self.X, self.QS_a[a], restricted=True) - lmm.fit(verbose=False) - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["a"] = a - best["v0"] = lmm.v0 - best["v1"] = lmm.v1 - best["alpha"] = lmm.beta - best["covariance"] = lmm.covariance() - self.best = best - - def score_2_dof(self, g): - alpha = self.best["alpha"][:-1] - beta = self.best["alpha"][-1] - # e²Σ + g²K = s²(aΣ + (1-a)K) - # e² = s²*a - # g² = s²*(1-a) - s2 = self.best["v0"] # s² - eps2 = self.best["v1"] # 𝜀² - - # H1 via score test - # Let K₀ = g²K + e²Σ + 𝜀²I - # with optimal values e² and 𝜀² found above. - K0 = self.best["covariance"] - - # Let P₀ = K⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. - K0iX = solve(K0, self.X) - P0 = inv(K0) - K0iX @ solve(self.X.T @ K0iX, K0iX.T) - - # P₀𝐲 = K⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. 
- K0iy = solve(K0, self.y) - P0y = K0iy - solve(K0, self.X @ solve(self.X.T @ K0iX, self.X.T @ K0iy)) - - # The covariance matrix of H1 is K = K₀ + b²diag(𝐠)⋅Σ⋅diag(𝐠) - # We have ∂K/∂b² = diag(𝐠)⋅Σ⋅diag(𝐠) - # The score test statistics is given by - # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 - dK = ddot(g, ddot(self.Sigma, g)) - Q = (P0y.T @ dK @ P0y) / 2 - - # Q is the score statistic for our interaction test and follows a linear combination - # of chi-squared (df=1) distributions: - # Q ∼ ∑λχ², where λᵢ are the non-zero eigenvalues of ½√P₀⋅∂K⋅√P₀. - sqrP0 = sqrtm(P0) - # lambdas = eigvalsh((sqrP0 @ dK @ sqrP0) / 2) - # lambdas = lambdas[lambdas > epsilon.small] - # print(lambdas) - # print(Q) - pval = davies_pvalue(Q, (sqrP0 @ dK @ sqrP0) / 2) - return pval - - -def _mod_liu(q, w): - from chiscore import liu_sf - - (pv, dof_x, _, info) = liu_sf(q, w, [1] * len(w), [0] * len(w), True) - return (pv, info["mu_q"], info["sigma_q"], dof_x) diff --git a/struct_lmm2/test/test_struct_lmm2.py b/struct_lmm2/test/test_struct_lmm2.py deleted file mode 100644 index 193fc68..0000000 --- a/struct_lmm2/test/test_struct_lmm2.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import scipy.stats as stats -from numpy import ones -from numpy.random import RandomState -from numpy.testing import assert_allclose - -from struct_lmm import StructLMM -from struct_lmm2 import StructLMM2 - - -def test_structlmm_int_2kinships(): - random = RandomState(0) - n = 100 - c = 2 - y = random.randn(n) - M = ones((n, 1)) - W = random.randn(n, c) - E = random.randn(n, 4) - - p_values1 = [] - p_values2 = [] - - for i in range(10): - random = RandomState(i) - g = random.randn(n) - # X = np.concatenate((W, g[:, np.newaxis]), axis=1) - lmm2 = StructLMM2(y, W, E) - lmm2.fit_null(g) - _p = lmm2.score_2_dof(g) - p_values2.append(_p) - - y = y.reshape(y.shape[0], 1) - lmm = StructLMM(y, M, E) - g = g.reshape(g.shape[0], 1) - covs1 = np.hstack((W, g)) - lmm.fit(covs1) - _p = lmm.score_2dof_inter(g) - p_values1.append(_p) - - 
print(stats.pearsonr(p_values2, p_values1)[0]) - assert_allclose(p_values2, p_values1, rtol=1e-4) diff --git a/struct_lmm_int_2kinships.py b/struct_lmm_int_2kinships.py deleted file mode 100644 index 00b1cbd..0000000 --- a/struct_lmm_int_2kinships.py +++ /dev/null @@ -1,160 +0,0 @@ -""" -Mixed-model with genetic effect heterogeneity. - -The extended StructLMM model (two random effects) is :: - - 𝐲 = W𝛂 + 𝐠𝛽 + 𝐠⊙𝛃 + 𝐞 + u + 𝛆, - -where :: - - 𝐠⊙𝛃 = ∑ᵢ𝐠ᵢ𝛽ᵢ - 𝛃 ∼ 𝓝(𝟎, b²Σ) - 𝐞 ∼ 𝓝(𝟎, e²Σ) - 𝛆 ∼ 𝓝(𝟎, 𝜀²I) - Σ = EEᵀ - u ~ 𝓝(𝟎, g²K) - -If one considers 𝛽 ∼ 𝓝(0, p²), we can insert -𝛽 into 𝛃 :: - - 𝛃_ ∼ 𝓝(𝟎, p²𝟏𝟏ᵀ + b²Σ) - -""" -import numpy as np -from numpy import concatenate, inf, newaxis -from numpy.linalg import eigvalsh, inv, solve -from numpy.random import RandomState -from numpy_sugar import epsilon -from numpy_sugar.linalg import ddot, economic_qs, economic_svd -from scipy.linalg import sqrtm -from chiscore import davies_pvalue, mod_liu, optimal_davies_pvalue - -from glimix_core.lmm import LMM - -from time import time -start = time() - -random = RandomState(0) -n = 10 -c = 2 -y = random.randn(n) -W = random.randn(n, c) -# g = random.randn(n) -E = random.randn(n, 4) -Sigma = E @ E.T -X = random.randn(n, 5) -K = X @ X.T - -# print() - -# 𝐲 = W𝛂 + 𝐠𝛽 + 𝐠⊙𝛃 + 𝐞 + 𝛆, -# 𝐞 ∼ 𝓝(𝟎, e²Σ) -# Σ = EEᵀ -# QS = economic_qs(Sigma) - -# 𝐲 = W𝛂 + 𝐞 + 𝛆 -# 𝓝(𝐲 | W𝛂, e²Σ + 𝜀²I) - -# precompute weighted sum of Sigma and K for set values of a -Cov = {} -QS_a = {} - -a_values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] - -for a in a_values: - Cov[a] = a * Sigma + (1 - a) * K - QS_a[a] = economic_qs(Cov[a]) - # print(QS_a[a][0][1].shape[1]) - - -# print(time() - start) -""" -Interaction test ----------------- -H0: b² = 0 => 𝐲 = W𝛂 + 𝐠𝛽 + 𝐞 + u + 𝛆 - 𝐲 ∼ 𝓝(W𝛂 + 𝐠𝛽, e²Σ + g²K + 𝜀²I) -H1: b² > 0 => 𝐲 = W𝛂 + 𝐠𝛽 + 𝐠⊙𝛃 + 𝐞 + u + 𝛆 - 𝐲 ∼ 𝓝(W𝛂 + 𝐠𝛽, e²Σ + g²K + 𝜀²I + b²Σ) -""" -start = time() -for i in range(20): - print(i) - random = RandomState(i) - g = random.randn(n) - X = 
concatenate((W, g[:, newaxis]), axis = 1) - # X_SVD = economic_svd(X) - best = {"lml": -inf, "a": 0, "v0": 0, "v1": 0, "beta": 0} - for a in a_values: - # cov(y) = v0*(aΣ + (1-a)K) + v1*I - lmm = LMM(y, X, QS_a[a], restricted = True) - lmm.fit(verbose = False) - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["a"] = a - best["v0"] = lmm.v0 - best["v1"] = lmm.v1 - best["alpha"] = lmm.beta - - # The way LMM represents: 𝓝(y|Xb, scale * ((1-δ)K + δI)) - # lmm.delta = 0.1 - # lmm.scale = 3.4 - # lmm.fix("scale") - # lmm.fix("delta") - -# H0 optimal parameters - alpha = lmm.beta[:-1] - beta = lmm.beta[-1] - # e²Σ + g²K = s²(aΣ + (1-a)K) - # e² = s²*a - # g² = s²*(1-a) - s2 = lmm.v0 # s² - eps2 = lmm.v1 # 𝜀² - - # H1 via score test - # Let K₀ = g²K + e²Σ + 𝜀²I - # with optimal values e² and 𝜀² found above. - K0 = lmm.covariance() - - # Let P₀ = K⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. - K0iX = solve(K0, X) - P0 = inv(K0) - K0iX @ solve(X.T @ K0iX, K0iX.T) - - # P₀𝐲 = K⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. - K0iy = solve(K0, y) - P0y = K0iy - solve(K0, X @ solve(X.T @ K0iX, X.T @ K0iy)) - - # The covariance matrix of H1 is K = K₀ + b²diag(𝐠)⋅Σ⋅diag(𝐠) - # We have ∂K/∂b² = diag(𝐠)⋅Σ⋅diag(𝐠) - # The score test statistics is given by - # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 - dK = ddot(g, ddot(Sigma, g)) - Q = (P0y.T @ dK @ P0y) / 2 - - # Q is the score statistic for our interaction test and follows a linear combination - # of chi-squared (df=1) distributions: - # Q ∼ ∑λχ², where λᵢ are the non-zero eigenvalues of ½√P₀⋅∂K⋅√P₀. 
- sqrP0 = sqrtm(P0) - # lambdas = eigvalsh((sqrP0 @ dK @ sqrP0) / 2) - # lambdas = lambdas[lambdas > epsilon.small] - # print(lambdas) - # print(Q) - pval = davies_pvalue(Q, (sqrP0 @ dK @ sqrP0) / 2) - print(pval) - # p_values2[i] = pval - - -# print(time() - start) -# print((time() - start)/100) - - # # compare to StructLMM (int) - - from struct_lmm import StructLMM - y = y.reshape(y.shape[0],1) - slmm_int = StructLMM(y, E, W = E, rho_list = [0]) - g = g.reshape(g.shape[0],1) - covs1 = np.hstack((W, g)) - null = slmm_int.fit_null(F = covs1, verbose = False) - _p = slmm_int.score_2_dof(g) - # p_values1[i] = _p - print(_p) - diff --git a/version.py b/version.py new file mode 100644 index 0000000..a3bbb9e --- /dev/null +++ b/version.py @@ -0,0 +1,17 @@ +import re +from os.path import join + +from setuptools import find_packages + + +def get(): + pkgnames = find_packages() + if len(pkgnames) == 0: + return "unknown" + pkgname = pkgnames[0] + content = open(join(pkgname, "__init__.py")).read() + c = re.compile(r"__version__ *= *('[^']+'|\"[^\"]+\")") + m = c.search(content) + if m is None: + return "unknown" + return m.groups()[0][1:-1]