diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml deleted file mode 100644 index ade52e5..0000000 --- a/.github/workflows/deploy.yaml +++ /dev/null @@ -1,49 +0,0 @@ -name: Deploy CellRegMap - -on: - push: - tags: - - v* - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - name: Get tag name - run: | - echo ${GITHUB_REF##*/} - - # clone the repo to the local directory - - uses: actions/checkout@v2 - - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: "3.8" - - # In the future, run the tests here - # - name: Test with pytest - # run: | - # python -m unittest test/ - - # Deploy to pypi - - name: Build and publish to pypi - env: - TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} - TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} - run: | - pip install wheel twine - python setup.py sdist bdist_wheel - twine upload dist/* - - - name: Login to Docker Hub - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - # Build and deploy the docker - - name: Build and publish to docker - run: | - DOCKER_TAG=annasecuomo/cellregmap:${GITHUB_REF##*/} - docker build -t $DOCKER_TAG . - docker push $DOCKER_TAG diff --git a/.gitignore b/.gitignore deleted file mode 100644 index ffbcc37..0000000 --- a/.gitignore +++ /dev/null @@ -1,125 +0,0 @@ -docs/_autosummary - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -.hypothesis/ -.pytest_cache/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don’t work, or not -# install all needed dependencies. -#Pipfile.lock - -# celery beat schedule file -celerybeat-schedule - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 7eba10b..0000000 --- a/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM python:3.8.5 - -ADD cellregmap /app/cellregmap/cellregmap -ADD setup.py setup.cfg version.py proof.md LICENSE MANIFEST.in /app/cellregmap/ -RUN pip install /app/cellregmap diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 97fb9fb..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2021 Anna Cuomo - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including 
without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100755 index 762aa52..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -include LICENSE -include README.md -include version.py diff --git a/NEWS.md b/NEWS.md new file mode 100644 index 0000000..e8cfda9 --- /dev/null +++ b/NEWS.md @@ -0,0 +1,18 @@ +--- +layout: default +title: NEWS +--- + +**`CellRegMap paper out in MSB!`**: +- CellRegMap is out in Molecular Systems Biology! Check out the [paper](https://www.embopress.org/doi/full/10.15252/msb.202110663) and see here for a [Tweetorial](https://twitter.com/AnnaSECuomo/status/1559805188994580482)! + +**`CellRegMap pipeline`**: +- We have developed a [WDL pipeline](https://github.com/populationgenomics/CellRegMap_pipeline) to facilitate running CellRegMap +- download the container from [Dockerhub](https://hub.docker.com/repository/docker/annasecuomo/cellregmap_pipeline)! + +**`Anna's presentation of CellRegMap at Biology of Genomes`**: +- Catch Anna's talk at [this link](https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1#video)! 
+ +**`CellRegMap preprint`**: +- the Cellular Regulatory Map (CellRegMap) is out as a [preprint](https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1)! +- See our [Twitter thread](https://twitter.com/AnnaSECuomo/status/1434059443956862978) for highlights! diff --git a/README.md b/README.md index 1bf02e0..da965af 100644 --- a/README.md +++ b/README.md @@ -1,47 +1 @@ -# CellRegMap - -Cellular Regulatory Map (CellRegMap) is a linear mixed model approach to perform multi-context eQTL mapping by leveraging single cell RNA sequencing (scRNA-seq) data. -It is related to the previously proposed [StructLMM](https://www.nature.com/articles/s41588-018-0271-0) but importantly it can account for sample structure, including population structure and repeated observations for the same samples, e.g., multiple cells for the same donor. - -The CellRegMap model and its applications to both real and simulated data are described in the [CellRegMap manuscript](https://www.embopress.org/doi/full/10.15252/msb.202110663). - -For more instructions and tutorials to facilitate usage of the package visit [our website](https://limix.github.io/CellRegMap/)! - -## Install - -From your command line, enter - - python3 -m pip install cellregmap - -in your command line. - -## Development - -To install it in development mode, enter - - git clone https://github.com/limix/CellRegMap.git - cd CellRegMap - python3 -m pip install -e . - -in your command line. - -## Installation using a Docker image -If you use Docker, you can also pull the [pre-build image from dockerhub](https://hub.docker.com/r/annasecuomo/cellregmap). - - - +Branch for the [webpage](https://limix.github.io/CellRegMap) associated with our [CellRegMap](https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1) model. 
diff --git a/__pycache__/version.cpython-38.pyc b/__pycache__/version.cpython-38.pyc new file mode 100644 index 0000000..565aebe Binary files /dev/null and b/__pycache__/version.cpython-38.pyc differ diff --git a/__pycache__/version.cpython-39.pyc b/__pycache__/version.cpython-39.pyc new file mode 100644 index 0000000..5cbe6c0 Binary files /dev/null and b/__pycache__/version.cpython-39.pyc differ diff --git a/_config.yml b/_config.yml new file mode 100644 index 0000000..04ce92d --- /dev/null +++ b/_config.yml @@ -0,0 +1,6 @@ +theme: jekyll-theme-cayman +remote_theme: pages-themes/cayman@v0.2.0 +plugins: +- jekyll-remote-theme # add this line to the plugins list if you already have one +# Build settings +markdown: kramdown diff --git a/_layouts/default.html b/_layouts/default.html new file mode 100644 index 0000000..83940de --- /dev/null +++ b/_layouts/default.html @@ -0,0 +1,59 @@ + + + + + + +{% seo %} + + + + + + + + + + + + +
+ {{ content }} + + + +
+ + diff --git a/assets/css/style.scss b/assets/css/style.scss new file mode 100644 index 0000000..5742f93 --- /dev/null +++ b/assets/css/style.scss @@ -0,0 +1,4 @@ +--- +--- + +@import "{{ site.theme }}"; diff --git a/cellregmap.egg-info/PKG-INFO b/cellregmap.egg-info/PKG-INFO new file mode 100644 index 0000000..94bc3e2 --- /dev/null +++ b/cellregmap.egg-info/PKG-INFO @@ -0,0 +1,59 @@ +Metadata-Version: 2.1 +Name: cellregmap +Version: 0.0.2 +Summary: A linear mixed model framework to map multivariate context-specific eQTL using single-cell RNA-seq data. +Home-page: https://github.com/limix/CellRegMap +Author: Anna Cuomo, Danilo Horta +Author-email: acuomo@ebi.ac.uk +Maintainer: Danilo Horta +Maintainer-email: horta@ebi.ac.uk +License: MIT +Download-URL: https://github.com/limix/CellRegMap +Description: # CellRegMap + + Cellular Regulatory Map, a linear mixed model approach to perform multi-context eQTL mapping leveraging single cell RNA sequencing (scRNA-seq) data. + Similar to [StructLMM](https://www.nature.com/articles/s41588-018-0271-0) but importantly now accounting for sample structure, including population structure and repeated observations for the same samples, e.g., multiple cells for the same donor. + + The CellRegMap model and its applications to both real and simulated data are described in the CellRegMap manuscript. + + ## Install from source + + From your command line, enter + + git clone https://github.com/limix/CellRegMap.git + cd CellRegMap + pip install -e . + + to install it in development mode. + Otherwise, enter + + pip install . 
+ + ## Running tests + + From your command line, enter + + python setup.py test + + ## Project layout + + ├─ old_files/ old scripts + ├─ references/ documents on the mathematical concepts + └─ CellRegMap/ package implementation + └─ test/ test file + + ## References + + - [Exploring Multivariate Gene-Environment Interactions: Models And Applications](https://www.repository.cam.ac.uk/handle/1810/290971) + - [Optimal tests for rare variant effects in sequencing association studies](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3440237/) [Supplementary material](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3440237/bin/supp_kxs014_kxs014supp.pdf) + +Keywords: scRNA-seq,eQTL,context-specific regulation +Platform: Windows +Platform: MacOS +Platform: Linux +Classifier: Development Status :: 5 - Production/Stable +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Classifier: Programming Language :: Python +Requires-Python: >=3.6 +Description-Content-Type: text/markdown diff --git a/cellregmap.egg-info/SOURCES.txt b/cellregmap.egg-info/SOURCES.txt new file mode 100644 index 0000000..f55bc09 --- /dev/null +++ b/cellregmap.egg-info/SOURCES.txt @@ -0,0 +1,23 @@ +LICENSE +MANIFEST.in +README.md +setup.cfg +setup.py +version.py +cellregmap/__init__.py +cellregmap/_cellregmap.py +cellregmap/_math.py +cellregmap/_simulate.py +cellregmap/_types.py +cellregmap.egg-info/PKG-INFO +cellregmap.egg-info/SOURCES.txt +cellregmap.egg-info/dependency_links.txt +cellregmap.egg-info/requires.txt +cellregmap.egg-info/top_level.txt +cellregmap.egg-info/zip-safe +cellregmap/test/__init__.py +cellregmap/test/old_test_struct_lmm2.py +cellregmap/test/test_fixed_gxe.py +cellregmap/test/test_math.py +cellregmap/test/test_simulation.py +cellregmap/test/test_struct_lmm2.py \ No newline at end of file diff --git a/cellregmap.egg-info/dependency_links.txt b/cellregmap.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null 
+++ b/cellregmap.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/cellregmap.egg-info/requires.txt b/cellregmap.egg-info/requires.txt new file mode 100644 index 0000000..6922764 --- /dev/null +++ b/cellregmap.egg-info/requires.txt @@ -0,0 +1,7 @@ +chiscore>=0.2.3 +glimix-core>=3.1.12 +numpy-sugar>=1.5.1 +numpy>=1.17.5 +pytest>=5.4.3 +scipy>=1.2.3 +tqdm>=4.41.1 diff --git a/cellregmap.egg-info/top_level.txt b/cellregmap.egg-info/top_level.txt new file mode 100644 index 0000000..5910351 --- /dev/null +++ b/cellregmap.egg-info/top_level.txt @@ -0,0 +1 @@ +cellregmap diff --git a/cellregmap.egg-info/zip-safe b/cellregmap.egg-info/zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/cellregmap.egg-info/zip-safe @@ -0,0 +1 @@ + diff --git a/cellregmap/__init__.py b/cellregmap/__init__.py deleted file mode 100644 index e8a5609..0000000 --- a/cellregmap/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -from ._cellregmap import CellRegMap, run_association, run_interaction, estimate_betas, run_association_fast -# from ._simulate import create_variances, sample_phenotype, sample_phenotype_gxe -from ._types import Term - -__version__ = "0.0.3" - -# these will be imported when adding ``from cellregmap import *`` -__all__ = [ - "__version__", - "CellRegMap", - "run_association", - "run_association_fast", - "run_interaction", - "estimate_betas", - # "lrt_pvalues", - # "sample_phenotype", - # "create_variances", - # "sample_phenotype_gxe", - "Term", -] diff --git a/cellregmap/__pycache__/__init__.cpython-38.pyc b/cellregmap/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000..aa5a544 Binary files /dev/null and b/cellregmap/__pycache__/__init__.cpython-38.pyc differ diff --git a/cellregmap/__pycache__/_cellregmap.cpython-38.pyc b/cellregmap/__pycache__/_cellregmap.cpython-38.pyc new file mode 100644 index 0000000..37b0723 Binary files /dev/null and b/cellregmap/__pycache__/_cellregmap.cpython-38.pyc differ diff --git 
a/cellregmap/__pycache__/_math.cpython-38.pyc b/cellregmap/__pycache__/_math.cpython-38.pyc new file mode 100644 index 0000000..a579717 Binary files /dev/null and b/cellregmap/__pycache__/_math.cpython-38.pyc differ diff --git a/cellregmap/__pycache__/_simulate.cpython-38.pyc b/cellregmap/__pycache__/_simulate.cpython-38.pyc new file mode 100644 index 0000000..7199d53 Binary files /dev/null and b/cellregmap/__pycache__/_simulate.cpython-38.pyc differ diff --git a/cellregmap/__pycache__/_types.cpython-38.pyc b/cellregmap/__pycache__/_types.cpython-38.pyc new file mode 100644 index 0000000..b68912b Binary files /dev/null and b/cellregmap/__pycache__/_types.cpython-38.pyc differ diff --git a/cellregmap/_cellregmap.py b/cellregmap/_cellregmap.py deleted file mode 100644 index 2a81bd4..0000000 --- a/cellregmap/_cellregmap.py +++ /dev/null @@ -1,682 +0,0 @@ -from typing import Optional - -from glimix_core.lmm import LMM -from numpy import ( - asarray, - atleast_1d, - atleast_2d, - concatenate, - inf, - linspace, - ones, - sqrt, - stack, -) -from numpy.linalg import cholesky -from numpy_sugar import ddot -from numpy_sugar.linalg import economic_qs_linear, economic_svd -from tqdm import tqdm - -from ._math import PMat, QSCov, ScoreStatistic - - -class CellRegMap: - """ - Mixed-model with genetic effect heterogeneity. - - The CellRegMap model can be cast as: - - 𝐲 = W𝛂 + 𝐠𝛽₁ + 𝐠⊙𝛃₂ + 𝐞 + 𝐮 + 𝛆, (1) - - where: - - 𝛃₂ ~ 𝓝(𝟎, 𝓋₃𝙴₀𝙴₀ᵀ), - 𝐞 ~ 𝓝(𝟎, 𝓋₁ρ₁𝙴₁𝙴₁ᵀ), - 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ), and - 𝛆 ~ 𝓝(𝟎, 𝓋₂𝙸). - - 𝐠⊙𝛃 is a random effect term which models the GxC effect. - Additionally, W𝛂 models additive covariates and 𝐠𝛽₁ models persistent genetic effects. - Both are modelled as fixed effects. - On the other hand, 𝐞, 𝐮 and 𝛆 are modelled as random effects - 𝐞 is the environment effect, 𝐮 is a background term accounting for interactions between population structure - and environmental structure, and 𝛆 is the iid noise. 
- The full covariance of 𝐲 is therefore given by: - - cov(𝐲) = 𝓋₃𝙳𝙴₀𝙴₀ᵀ𝙳 + 𝓋₁ρ₁𝙴₁𝙴₁ᵀ + 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ + 𝓋₂𝙸, - - where 𝙳 = diag(𝐠). Its marginalised form is given by: - - 𝐲 ~ 𝓝(W𝛂 + 𝐠𝛽₁, 𝓋₃𝙳𝙴₀𝙴₀ᵀ𝙳 + 𝓋₁(ρ₁𝙴₁𝙴₁ᵀ + (1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ) + 𝓋₂𝙸). - - The CellRegMap method is used to perform an interaction test: - - The interaction test compares the following hypotheses (from Eq. 1): - - 𝓗₀: 𝓋₃ = 0 - 𝓗₁: 𝓋₃ > 0 - - 𝓗₀ denotes no GxE effects, while 𝓗₁ models the presence of GxE effects. - - """ - - def __init__(self, y, E, W=None, Ls=None, E1=None, hK=None): - self._y = asarray(y, float).flatten() - self._E0 = asarray(E, float) - Ls = [] if Ls is None else Ls - - if W is not None: - self._W = asarray(W, float) - else: - self._W = ones((self._y.shape[0], 1)) - - if E1 is not None: - self._E1 = asarray(E1, float) - else: - self._E1 = asarray(E, float) - - - self._Ls = list(asarray(L, float) for L in Ls) - - assert self._W.ndim == 2 - assert self._E0.ndim == 2 - assert self._E1.ndim == 2 - - assert self._y.shape[0] == self._W.shape[0] - assert self._y.shape[0] == self._E0.shape[0] - assert self._y.shape[0] == self._E1.shape[0] - - for L in Ls: - assert self._y.shape[0] == L.shape[0] - assert L.ndim == 2 - - self._null_lmm_assoc = {} - - self._halfSigma = {} - self._Sigma_qs = {} - # TODO: remove it after debugging - self._Sigma = {} - - # option to set different background (when Ls are defined, background is K*EEt + EEt) - if len(Ls) == 0: - # self._rho0 = [1.0] - if hK is None: # EEt only as background - self._rho1 = [1.0] - self._halfSigma[1.0] = self._E1 - self._Sigma_qs[1.0] = economic_qs_linear(self._E1, return_q1=False) - else: # hK is decomposition of K, background in this case is K + EEt - self._rho1 = linspace(0, 1, 11) - for rho1 in self._rho1: - a = sqrt(rho1) - b = sqrt(1 - rho1) - hS = concatenate([a * self._E1] + [b * hK], axis=1) - self._halfSigma[rho1] = hS - self._Sigma_qs[rho1] = economic_qs_linear( - self._halfSigma[rho1], return_q1=False - ) - else: - # 
self._rho0 = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] - self._rho1 = linspace(0, 1, 11) - for rho1 in self._rho1: - # Σ = ρ₁𝙴𝙴ᵀ + (1-ρ₁)𝙺⊙E - # concatenate((sqrt(rho1) * self._E, sqrt(1 - rho1) * G1), axis=1) - # self._Sigma[rho1] = rho1 * self._EE + (1 - rho1) * self._K - # self._Sigma_qs[rho1] = economic_qs(self._Sigma[rho1]) - a = sqrt(rho1) - b = sqrt(1 - rho1) - hS = concatenate([a * self._E1] + [b * L for L in Ls], axis=1) - self._halfSigma[rho1] = hS - self._Sigma_qs[rho1] = economic_qs_linear( - self._halfSigma[rho1], return_q1=False - ) - - @property - def n_samples(self): - return self._y.shape[0] - - def predict_interaction(self, G, MAF): - """ - Estimate effect sizes for a given set of SNPs - """ - # breakpoint() - G = asarray(G, float) - E0 = self._E0 - W = self._W - n_snps = G.shape[1] - beta_g_s = [] - beta_gxe_s = [] - - p = asarray(atleast_1d(MAF), float) - normalization = 1 / sqrt(2 * p * (1 - p)) - - for i in range(n_snps): - g = G[:, [i]] - # mean(𝐲) = W𝛂 + 𝐠𝛽₁ + 𝙴𝝲 = 𝙼𝛃 - M = concatenate((W, g, E0), axis=1) - gE = g * E0 - best = {"lml": -inf, "rho1": 0} - hSigma_p = {} - Sigma_qs = {} - for rho1 in self._rho1: - # Σ[ρ₁] = ρ₁(𝐠⊙𝙴)(𝐠⊙𝙴)ᵀ + (1-ρ₁)𝙺⊙EEᵀ - a = sqrt(rho1) - b = sqrt(1 - rho1) - hSigma_p[rho1] = concatenate( - [a * gE] + [b * L for L in self._Ls], axis=1 - ) - # ( - # (a * gE, b * self._G), axis=1 - # ) - # cov(𝐲) = 𝓋₁Σ[ρ₁] + 𝓋₂𝙸 - # lmm = Kron2Sum(Y, [[1]], M, hSigma_p[rho1], restricted=True) - Sigma_qs[rho1] = economic_qs_linear( - hSigma_p[rho1], return_q1=False - ) - lmm = LMM(self._y, M, Sigma_qs[rho1], restricted=True) - lmm.fit(verbose=False) - - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["rho1"] = rho1 - best["lmm"] = lmm - - # breakpoint() - lmm = best["lmm"] - # beta_g = 𝛽₁ - beta_g = lmm.beta[W.shape[1]] - # yadj = 𝐲 - 𝙼𝛃 - yadj = (self._y - lmm.mean()).reshape(self._y.shape[0], 1) - rho1 = best["rho1"] - v1 = lmm.v0 - v2 = lmm.v1 - hSigma_p_qs = economic_qs_linear(hSigma_p[rho1], 
return_q1=False) - qscov = QSCov(hSigma_p_qs[0][0], hSigma_p_qs[1], v1, v2) - # v = cov(𝐲)⁻¹(𝐲 - 𝙼𝛃) - v = qscov.solve(yadj) - - sigma2_gxe = v1 * rho1 - beta_gxe = sigma2_gxe * E0 @ (gE.T @ v) * normalization[i] - # beta_star = (beta_g * normalization + beta_gxe) - - beta_g_s.append(beta_g) - beta_gxe_s.append(beta_gxe) - - - return (asarray(beta_g_s), stack(beta_gxe_s).T) - - def estimate_aggregate_environment(self, g): - g = atleast_2d(g).reshape((g.size, 1)) - E0 = self._E0 - gE = g * E0 - W = self._W - M = concatenate((W, g, E0), axis=1) - best = {"lml": -inf, "rho1": 0} - hSigma_p = {} - for rho1 in self._rho1: - # Σₚ = ρ₁(𝐠⊙𝙴)(𝐠⊙𝙴)ᵀ + (1-ρ₁)𝙺⊙E - a = sqrt(rho1) - b = sqrt(1 - rho1) - hSigma_p[rho1] = concatenate([a * gE] + [b * L for L in self._Ls], axis=1) - # cov(𝐲) = 𝓋₁Σₚ + 𝓋₂𝙸 - # lmm = Kron2Sum(Y, [[1]], M, hSigma_p[rho1], restricted=True) - QS = self._Sigma_qs[rho1] - lmm = LMM(self._y, M, QS, restricted=True) - lmm.fit(verbose=False) - - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["rho1"] = rho1 - best["lmm"] = lmm - - lmm = best["lmm"] - yadj = self._y - lmm.mean() - # rho1 = best["rho1"] - v1 = lmm.v0 - v2 = lmm.v1 - rho1 = best["rho1"] - sigma2_gxe = rho1 * v1 - hSigma_p_qs = economic_qs_linear(hSigma_p[rho1], return_q1=False) - qscov = QSCov(hSigma_p_qs[0][0], hSigma_p_qs[1], v1, v2) - # v = cov(𝐲)⁻¹yadj - v = qscov.solve(yadj) - beta_gxe = sigma2_gxe * gE.T @ v - - return E0 @ beta_gxe - - def scan_association(self, G): - info = {"rho1": [], "e2": [], "g2": [], "eps2": []} - - # NULL model - best = {"lml": -inf, "rho1": 0} - for rho1 in self._rho1: - QS = self._Sigma_qs[rho1] - # LRT for fixed effects requires ML rather than REML estimation - lmm = LMM(self._y, self._W, QS, restricted=False) - lmm.fit(verbose=False) - - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["rho1"] = rho1 - best["lmm"] = lmm - - null_lmm = best["lmm"] - info["rho1"].append(best["rho1"]) - info["e2"].append(null_lmm.v0 * best["rho1"]) - 
info["g2"].append(null_lmm.v0 * (1 - best["rho1"])) - info["eps2"].append(null_lmm.v1) - - n_snps = G.shape[1] - alt_lmls = [] - for i in tqdm(range(n_snps)): - g = G[:, [i]] - X = concatenate((self._W, g), axis=1) - QS = self._Sigma_qs[best["rho1"]] - alt_lmm = LMM(self._y, X, QS, restricted=False) - alt_lmm.fit(verbose=False) - alt_lmls.append(alt_lmm.lml()) - - pvalues = lrt_pvalues(null_lmm.lml(), alt_lmls, dof=1) - - info = {key: asarray(v, float) for key, v in info.items()} - return asarray(pvalues, float), info - - - def scan_association_fast(self, G): - info = {"rho1": [], "e2": [], "g2": [], "eps2": []} - - # NULL model - best = {"lml": -inf, "rho1": 0} - for rho1 in self._rho1: - QS = self._Sigma_qs[rho1] - # LRT for fixed effects requires ML rather than REML estimation - lmm = LMM(self._y, self._W, QS, restricted=False) - lmm.fit(verbose=False) - - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["rho1"] = rho1 - best["lmm"] = lmm - - null_lmm = best["lmm"] - info["rho1"].append(best["rho1"]) - info["e2"].append(null_lmm.v0 * best["rho1"]) - info["g2"].append(null_lmm.v0 * (1 - best["rho1"])) - info["eps2"].append(null_lmm.v1) - - # Alternative model - lmm = null_lmm - flmm = lmm.get_fast_scanner() - alt_lmls = flmm.fast_scan(G, verbose=False)['lml'] - - pvalues = lrt_pvalues(null_lmm.lml(), alt_lmls, dof=1) - - info = {key: asarray(v, float) for key, v in info.items()} - return asarray(pvalues, float), info - - - def scan_interaction( - self, G, idx_E: Optional[any] = None, idx_G: Optional[any] = None - ): - """ - 𝐲 = W𝛂 + 𝐠𝛽₁ + 𝐠⊙𝛃₂ + 𝐞 + 𝐮 + 𝛆 - [fixed=X] [H1] - - 𝛃₂ ~ 𝓝(𝟎, 𝓋₃𝙴₀𝙴₀ᵀ), - 𝐞 ~ 𝓝(𝟎, 𝓋₁ρ₁𝙴₁𝙴₁ᵀ), - 𝐮 ~ 𝓝(𝟎, 𝓋₁(1-ρ₁)𝙺⊙𝙴₂𝙴₂ᵀ), and - 𝛆 ~ 𝓝(𝟎, 𝓋₂𝙸). 
- - 𝓗₀: 𝓋₃ = 0 - 𝓗₁: 𝓋₃ > 0 - """ - # TODO: make sure G is nxp - from chiscore import davies_pvalue - - G = asarray(G, float) - n_snps = G.shape[1] - pvalues = [] - info = {"rho1": [], "e2": [], "g2": [], "eps2": []} - - for i in tqdm(range(n_snps)): - g = G[:, [i]] - X = concatenate((self._W, g), axis=1) - best = {"lml": -inf, "rho1": 0} - # Null model fitting: find best (𝛂, 𝛽₁, 𝓋₁, 𝓋₂, ρ₁) - for rho1 in self._rho1: - # QS = self._Sigma_qs[rho1] - # halfSigma = self._halfSigma[rho1] - # Σ = ρ₁𝙴𝙴ᵀ + (1-ρ₁)𝙺⊙E - # cov(y₀) = 𝓋₁Σ + 𝓋₂I - QS = self._Sigma_qs[rho1] - lmm = LMM(self._y, X, QS, restricted=True) - lmm.fit(verbose=False) - - if lmm.lml() > best["lml"]: - best["lml"] = lmm.lml() - best["rho1"] = rho1 - best["lmm"] = lmm - - lmm = best["lmm"] - # H1 via score test - # Let K₀ = e²𝙴𝙴ᵀ + g²𝙺⊙E + 𝜀²I - # e²=𝓋₁ρ₁ - # g²=𝓋₁(1-ρ₁) - # 𝜀²=𝓋₂ - # with optimal values 𝓋₁ and 𝓋₂ found above. - info["rho1"].append(best["rho1"]) - info["e2"].append(lmm.v0 * best["rho1"]) - info["g2"].append(lmm.v0 * (1 - best["rho1"])) - info["eps2"].append(lmm.v1) - # QS = economic_decomp( Σ(ρ₁) ) - Q0 = self._Sigma_qs[best["rho1"]][0][0] - S0 = self._Sigma_qs[best["rho1"]][1] - # e2 = best["lmm"].v0 * best["rho1"] - # g2 = best["lmm"].v0 * (1 - best["rho1"]) - # eps2 = best["lmm"].v1 - # EE = self._E @ self._E.T - # K = self._G @ self._G.T - # K0 = e2 * EE + g2 * K + eps2 * eye(K.shape[0]) - qscov = QSCov( - Q0, - S0, - lmm.v0, # 𝓋₁ - lmm.v1, # 𝓋₂ - ) - # start = time() - # qscov = QSCov(self._Sigma_qs[best["rho1"]], lmm.C0[0, 0], lmm.C1[0, 0]) - # print(f"Elapsed: {time() - start}") - # X = concatenate((self._E, g), axis=1) - # X = concatenate((self._W, g), axis=1) - - # Let P₀ = K₀⁻¹ - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹. - P = PMat(qscov, X) - # P0 = inv(K0) - inv(K0) @ X @ inv(X.T @ inv(K0) @ X) @ X.T @ inv(K0) - - # P₀𝐲 = K₀⁻¹𝐲 - K₀⁻¹X(XᵀK₀⁻¹X)⁻¹XᵀK₀⁻¹𝐲. 
- - # Useful for permutation - if idx_E is None: - E0 = self._E0 - else: - E0 = self._E0[idx_E, :] - - # The covariance matrix of H1 is K = K₀ + 𝓋₃diag(𝐠)⋅𝙴𝙴ᵀ⋅diag(𝐠) - # We have ∂K/∂𝓋₃ = diag(𝐠)⋅𝙴𝙴ᵀ⋅diag(𝐠) - # The score test statistics is given by - # Q = ½𝐲ᵀP₀⋅∂K⋅P₀𝐲 - # start = time() - - # Useful for permutation - if idx_G is None: - gtest = g.ravel() - else: - gtest = g.ravel()[idx_G] - - ss = ScoreStatistic(P, qscov, ddot(gtest, E0)) - Q = ss.statistic(self._y) - # import numpy as np - - # deltaK = np.diag(gtest) @ EE @ np.diag(gtest) - # Q_ = 0.5 * self._y.T @ P0 @ deltaK @ P0 @ self._y - # print(f"Elapsed: {time() - start}") - # Q is the score statistic for our interaction test and follows a linear - # combination - # of chi-squared (df=1) distributions: - # Q ∼ ∑λχ², where λᵢ are the non-zero eigenvalues of ½√P₀⋅∂K⋅√P₀. - # Since eigenvals(𝙰𝙰ᵀ) = eigenvals(𝙰ᵀ𝙰) (TODO: find citation), - # we can compute ½(√∂K)P₀(√∂K) instead. - # start = time() - # import scipy as sp - # sqrtm = sp.linalg.sqrtm - # np.linalg.eigvalsh(0.5 * sqrtm(P0) @ deltaK @ sqrtm(P0)) - # np.linalg.eigvalsh(0.5 * sqrtm(deltaK) @ P0 @ sqrtm(deltaK)) - # TODO: compare with Liu approximation, maybe try a computational intensive - # method - pval, pinfo = davies_pvalue(Q, ss.matrix_for_dist_weights(), True) - pvalues.append(pval) - # print(f"Elapsed: {time() - start}") - - info = {key: asarray(v, float) for key, v in info.items()} - return asarray(pvalues, float), info - - -def lrt_pvalues(null_lml, alt_lmls, dof=1): - """ - Compute p-values from likelihood ratios. - - These are likelihood ratio test p-values. - - Parameters - ---------- - null_lml : float - Log of the marginal likelihood under the null hypothesis. - alt_lmls : array_like - Log of the marginal likelihoods under the alternative hypotheses. - dof : int - Degrees of freedom. - - Returns - ------- - pvalues : ndarray - P-values. 
- """ - from numpy import clip - from numpy_sugar import epsilon - from scipy.stats import chi2 - - lrs = clip(-2 * null_lml + 2 * asarray(alt_lmls, float), epsilon.super_tiny, inf) - pv = chi2(df=dof).sf(lrs) - return clip(pv, epsilon.super_tiny, 1 - epsilon.tiny) - -def run_association(y, W, E, G, hK=None): - """ - Association test. - - Test for persistent genetic effects. - - Compute p-values using a likelihood ratio test. - - Parameters - ---------- - y : array - Phenotype - W : array - Fixed effect covariates - E : array - Cellular contexts - G : array - Genotypes (expanded) - hK : array - decompositon of kinship matrix (expanded) - - Returns - ------- - pvalues : ndarray - P-values. - """ - if hK is None: hK = None - crm = CellRegMap(y, W, E, hK=hK) - pv = crm.scan_association(G) - return pv - -def run_association_fast(y, W, E, G, hK=None): - """ - Association test. - - Test for persistent genetic effects. - - Compute p-values using a likelihood ratio test. - - Parameters - ---------- - y : array - Phenotype - W : array - Fixed effect covariates - E : array - Cellular contexts - G : array - Genotypes (expanded) - hK : array - decompositon of kinship matrix (expanded) - - Returns - ------- - pvalues : ndarray - P-values. - """ - if hK is None: hK = None - crm = CellRegMap(y, W, E, hK=hK) - pv = crm.scan_association_fast(G) - return pv - -def get_L_values(hK, E): - """ - As the definition of Ls is not particulatly intuitive, - function to extract list of L values given kinship K and - cellular environments E - """ - # get eigendecomposition of EEt - [U, S, _] = economic_svd(E) - us = U * S - - # get decomposition of K \odot EEt - Ls = [ddot(us[:,i], hK) for i in range(us.shape[1])] - return Ls - -def run_interaction(y, E, G, W=None, E1=None, E2=None, hK=None, idx_G=None): - """ - Interaction test. - - Test for cell-level genetic effects due to GxC interactions. - - Compute p-values using a score test. 
- - Parameters - ---------- - y : array - Phenotype - E : array - Cellular contexts (GxC component) - G : array - Genotypes (expanded) - W : array - Fixed effect covariates - hK : array - decompositon of kinship matrix (expanded) - E1 : array - Cellular contexts (C component) - E2 : array - Cellular contexts (K*C component) - idx_G : array - Permuted genotype index - - Returns - ------- - pvalues : ndarray - P-values. - """ - if E1 is None: E1 = E - else: E1 = E1 - if E2 is None: E2 = E - else: E2 = E2 - if hK is None: Ls = None - else: Ls = get_L_values(hK, E2) - crm = CellRegMap(y=y, E=E, W=W, E1=E1, Ls=Ls) - pv = crm.scan_interaction(G, idx_G) - return pv - -def compute_maf(X): - r"""Compute minor allele frequencies. - It assumes that ``X`` encodes 0, 1, and 2 representing the number - of alleles (or dosage), or ``NaN`` to represent missing values. - Parameters - ---------- - X : array_like - Genotype matrix. - Returns - ------- - array_like - Minor allele frequencies. - Examples - -------- - .. 
doctest:: - >>> from numpy.random import RandomState - >>> from limix.qc import compute_maf - >>> - >>> random = RandomState(0) - >>> X = random.randint(0, 3, size=(100, 10)) - >>> - >>> print(compute_maf(X)) # doctest: +FLOAT_CMP - [0.49 0.49 0.445 0.495 0.5 0.45 0.48 0.48 0.47 0.435] - """ - import dask.array as da - import xarray as xr - from pandas import DataFrame - from numpy import isnan, logical_not, minimum, nansum - if isinstance(X, da.Array): - s0 = da.nansum(X, axis=0).compute() - denom = 2 * (X.shape[0] - da.isnan(X).sum(axis=0)).compute() - elif isinstance(X, DataFrame): - s0 = X.sum(axis=0, skipna=True) - denom = 2 * logical_not(X.isna()).sum(axis=0) - elif isinstance(X, xr.DataArray): - if "sample" in X.dims: - kwargs = {"dim": "sample"} - else: - kwargs = {"axis": 0} - s0 = X.sum(skipna=True, **kwargs) - denom = 2 * logical_not(isnan(X)).sum(**kwargs) - else: - s0 = nansum(X, axis=0) - denom = 2 * logical_not(isnan(X)).sum(axis=0) - s0 = s0 / denom - s1 = 1 - s0 - maf = minimum(s0, s1) - if hasattr(maf, "name"): - maf.name = "maf" - return maf - -def estimate_betas(y, W, E, G, maf=None, E1=None, E2=None, hK=None): - """ - Effect sizes estimator - - Estimates cell-level genetic effects due to GxC - as well as persistent genetic effects across all cells. - - Parameters - ---------- - y : array - Phenotype - W : array - Fixed effect covariates - E : array - Cellular contexts - G : array - Genotypes (expanded) - maf: array - Minor allele frequencies (MAFs) for the SNPs in G - hK : array - decompositon of kinship matrix (expanded) - E1 : array - Cellular contexts (C component) - E2 : array - Cellular contexts (K*C component) - - Returns - ------- - betas : ndarray - estimated effect sizes, both persistent and due to GxC. 
- """ - if E1 is None: E1 = E - else: E1 = E1 - if E2 is None: E2 = E - else: E2 = E2 - if hK is None: Ls = None - else: Ls = get_L_values(hK, E2) - crm = CellRegMap(y=y, E=E, W=W, E1=E1, Ls=Ls) - if maf is None: - maf = compute_maf(G) - # print("MAFs: {}".format(maf)) - betas = crm.predict_interaction(G, maf) - return betas diff --git a/cellregmap/_math.py b/cellregmap/_math.py deleted file mode 100644 index fc56572..0000000 --- a/cellregmap/_math.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -Mathematical definitions ------------------------- - -This module defines some core mathematical concepts which CellRegMap depends on. -The implementations provided here are meant to help understand those concepts and to provide test cases. - -We assume the definition - - 𝐲 ∼ 𝓝(𝚆, 𝙺) - -throughout this module. - -References ----------- -.. [1] Lippert, C., Xiang, J., Horta, D., Widmer, C., Kadie, C., Heckerman, D., - & Listgarten, J. (2014). Greater power and computational efficiency for - kernel-based association testing of sets of genetic variants. Bioinformatics, - 30(22), 3206-3214. -.. [2] Liu, H., Tang, Y., & Zhang, H. H. (2009). A new chi-square approximation - to the distribution of non-negative definite quadratic forms in non-central - normal variables. Computational Statistics & Data Analysis, 53(4), 853-856. -.. [3] Lee, Seunggeun, Michael C. Wu, and Xihong Lin. "Optimal tests for rare - variant effects in sequencing association studies." Biostatistics 13.4 (2012): - 762-775. -""" -from numpy import finfo, logical_not, sqrt -from numpy.linalg import eigh, eigvalsh, inv, lstsq, solve, svd -from numpy_sugar import ddot -from scipy.linalg import sqrtm - - -def rsolve(a, b): - """ - Robust solver. - """ - return lstsq(a, b, rcond=None)[0] - - -class QSCov: - """ - Represents 𝑎𝙺 + 𝑏𝙸. - - 𝙺 matrix is defined by its eigen decomposition, `QS`. 
- """ - - def __init__(self, Q0, S0, a=1.0, b=1.0): - self._Q0 = Q0 - self._S0 = S0 - self._a = a - self._b = b - - def dot(self, v): - left = self._a * self._Q0 @ ddot(self._S0, self._Q0.T @ v, left=True) - right = self._b * v - return left + right - - def solve(self, v): - # nrows = self._Q0.shape[0] - # rank = self._Q0.shape[1] - - # tmp = ones(nrows) - # tmp[:rank] += (self._a / self._b) * self._S0 - # R = 1 / tmp - - # R0 = R[:rank] - # R1 = R[rank:] # Always ones - - # tmp = (self._a / self._b) * self._S0 - R0 = 1 / (1 + ((self._a / self._b) * self._S0)) - # R0 = R[:rank] - Q0v = self._Q0.T @ v - return (self._Q0 @ ddot(R0, Q0v, left=True) + v - self._Q0 @ Q0v) / self._b - # left = self._Q0 @ ddot(R0, self._Q0.T @ v, left=True) - # right = self._Q1 @ ddot(R1, self._Q1.T @ v, left=True) - # return (left + right) / self._b - - -class PMat: - """ - Represents 𝙿 = 𝙺⁻¹ - 𝙺⁻¹𝚆(𝚆ᵀ𝙺⁻¹𝚆)⁻¹𝚆ᵀ𝙺⁻¹. - - The 𝙺 is defined via an `QSCov` object. - """ - - def __init__(self, qscov: QSCov, W): - self._qscov = qscov - self._W = W - self._KiW = self._qscov.solve(self._W) - - def dot(self, v): - Kiv = self._qscov.solve(v) - return Kiv - self._KiW @ rsolve(self._W.T @ self._KiW, self._KiW.T @ v) - - -def P_matrix(W, K): - """ Computes 𝙿 = 𝙺⁻¹ - 𝙺⁻¹𝚆(𝚆ᵀ𝙺⁻¹𝚆)⁻¹𝚆ᵀ𝙺⁻¹. """ - KiW = solve(K, W) - return inv(K) - KiW @ solve(W.T @ KiW, KiW.T) - - -class ScoreStatistic: - """ - Score-test statistic [1]_ is given by - - 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. - """ - - def __init__(self, P: PMat, K: QSCov, sqrt_dK): - self._P = P - self._K = K - self._sqrt_dK = sqrt_dK - - def statistic(self, y): - """ Compute 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. """ - Py = self._P.dot(y) - return Py.T @ self._sqrt_dK @ self._sqrt_dK.T @ Py / 2 - - def matrix_for_dist_weights(self): - """Compute ½(√∂𝙺)𝙿(√∂𝙺). - - The returned matrix has its eigenvalues equal to the eigenvalues of ½√𝙿(∂𝙺)√𝙿. 
- """ - return self._sqrt_dK.T @ self._P.dot(self._sqrt_dK) / 2 - - def distr_weights(self): - weights = eigvalsh(self.matrix_for_dist_weights()) - return weights[weights > 1e-16] - - -def score_statistic(y, W, K, dK): - """ - Score-test statistic [1]_ is given by - - 𝑄 = ½𝐲ᵀ𝙿(∂𝙺)𝙿𝐲. - """ - P = P_matrix(W, K) - return y.T @ P @ dK @ P @ y / 2 - - -def score_statistic_qs(y, W, qscov, dK): - """ - Same as `score_statistic`. - """ - P = PMat(qscov, W) - Py = P.dot(y) - return Py.T @ dK @ Py / 2 - - -def score_statistic_distr_weights(W, K, dK): - """ - Score-test statistic follows a weighted sum of random variables [1]_: - - 𝑄 ∼ ∑ᵢ𝜆ᵢχ²(1), - - where 𝜆ᵢ are the non-zero eigenvalues of ½√𝙿(∂𝙺)√𝙿. - """ - P = P_matrix(W, K) - weights = eigvalsh(sqrtm(P) @ dK @ sqrtm(P)) / 2 - return weights[weights > 1e-16] - - -def score_statistic_liu_params(q, weights): - """ - Computes Pr(𝑄 > q) for 𝑄 ∼ ∑ᵢ𝜆ᵢχ²(1) using a modification [3]_ of the original Liu - survival function approximation [2]_. This function also returns estimated - parameters, not yet fully explained here. - """ - from chiscore import liu_sf - - n = len(weights) - # We use the Liu survival function to approximate the distribution followed by a - # linear combination of noncentral chi-squared variables (Q) using only three - # parameters of such distribution: the weights, degrees of freedom, and - # noncentrality (Qh). 𝑄 ∼ ∑λᵢχ²(hᵢ, 𝛿ᵢ), - # where λᵢ, hᵢ, and 𝛿ᵢ are the weights, degrees of freedom (1), and noncentral - # (0) parameters. By setting the last input to True we use the modified version - # [REF]. 
- (pv, dof_x, _, info) = liu_sf(q, weights, [1] * n, [0] * n, True) - return {"pv": pv, "mu_q": info["mu_q"], "sigma_q": info["sigma_q"], "dof_x": dof_x} - - -def qmin(liu_params): - import scipy.stats as st - from numpy import zeros - - n = len(liu_params) - - # T statistic - T = min(i["pv"] for i in liu_params) - - qmin = zeros(n) - percentile = 1 - T - for i in range(n): - q = st.chi2.ppf(percentile, liu_params[i]["dof_x"]) - mu_q = liu_params[i]["mu_q"] - sigma_q = liu_params[i]["sigma_q"] - dof = liu_params[i]["dof_x"] - qmin[i] = (q - dof) / (2 * dof) ** 0.5 * sigma_q + mu_q - - return qmin - - -def economic_qs(K, epsilon=sqrt(finfo(float).eps)): - r"""Economic eigen decomposition for symmetric matrices. - - A symmetric matrix ``K`` can be decomposed in - :math:`\mathrm Q_0 \mathrm S_0 \mathrm Q_0^\intercal + \mathrm Q_1\ - \mathrm S_1 \mathrm Q_1^ \intercal`, where :math:`\mathrm S_1` is a zero - matrix with size determined by ``K``'s rank deficiency. - - Args: - K (array_like): Symmetric matrix. - epsilon (float): Eigen value threshold. Default is - ``sqrt(finfo(float).eps)``. - - Returns: - tuple: ``((Q0, Q1), S0)``. - """ - - (S, Q) = eigh(K) - - nok = abs(max(Q[0].min(), Q[0].max(), key=abs)) < epsilon - nok = nok and abs(max(K.min(), K.max(), key=abs)) >= epsilon - if nok: - from scipy.linalg import eigh as sp_eigh - - (S, Q) = sp_eigh(K) - - ok = S >= epsilon - nok = logical_not(ok) - S0 = S[ok] - Q0 = Q[:, ok] - Q1 = Q[:, nok] - return ((Q0, Q1), S0) - - -def economic_qs_linear(G): - r"""Economic eigen decomposition for symmetric matrices ``dot(G, G.T)``. - - It is theoretically equivalent to ``economic_qs(dot(G, G.T))``. - Refer to :func:`numpy_sugar.economic_qs` for further information. - - Args: - G (array_like): Matrix. - - Returns: - tuple: ``((Q0, Q1), S0)``. 
- """ - if G.shape[0] > G.shape[1]: - (Q0, Ssq, _) = svd(G, full_matrices=False) - S0 = Ssq ** 2 - return (Q0, S0) - - Q, S0 = economic_qs(G.dot(G.T)) - return (Q[0], S0) diff --git a/cellregmap/_simulate.py b/cellregmap/_simulate.py deleted file mode 100644 index 0665a49..0000000 --- a/cellregmap/_simulate.py +++ /dev/null @@ -1,479 +0,0 @@ -from collections import namedtuple -from typing import List, Union - -from numpy import ( - array_split, - asarray, - cumsum, - errstate, - eye, - isscalar, - ones, - repeat, - split, - sqrt, - stack, - zeros, -) -from numpy.random import Generator -from numpy_sugar import ddot -from numpy_sugar.linalg import economic_svd - -from ._types import Term - -Variances = namedtuple("Variances", "g gxe k e n") -Simulation = namedtuple( - "Simulation", "mafs y offset beta_g y_g y_gxe y_k y_e y_n variances G E Lk Ls K M" -) -SimulationFixedGxE = namedtuple( - "Simulation", - "mafs y offset beta_g beta_gxe beta_e y_g y_gxe y_k y_e y_n variances G E Lk K M", -) - - -def sample_maf(n_snps: int, maf_min: float, maf_max: float, random: Generator): - assert maf_min <= maf_max and maf_min >= 0 and maf_max <= 1 - return random.random(n_snps) * (maf_max - maf_min) + maf_min - - -def sample_genotype(n_samples: int, mafs, random): - G = [] - mafs = asarray(mafs, float) - for maf in mafs: - probs = [(1 - maf) ** 2, 1 - ((1 - maf) ** 2 + maf ** 2), maf ** 2] - g = random.choice([0, 1, 2], p=probs, size=n_samples) - G.append(asarray(g, float)) - - return stack(G, axis=1) - - -def column_normalize(X): - X = asarray(X, float) - - with errstate(divide="raise", invalid="raise"): - return (X - X.mean(0)) / X.std(0) - - -def create_environment_matrix( - n_samples: int, n_env: int, groups: List[List[int]], random: Generator -): - E = random.normal(size=[n_samples, n_env]) - E = column_normalize(E) - EE = E @ E.T - EE /= EE.diagonal().mean() - H = sample_covariance_matrix(n_samples, groups)[1] - M = EE + H - M /= M.diagonal().mean() - jitter(M) - return 
_symmetric_decomp(M) - - -def create_environment_vector( - n_samples: int, groups: List[List[int]], random: Generator -): - E = zeros((n_samples, 1)) - - values = random.choice([-1, 1], 2, False) - for value, group in zip(values, groups): - E[group, 0] = value - - return E - - -def sample_covariance_matrix(n_samples: int, groups: List[List[int]]): - X = zeros((n_samples, len(groups))) - - for i, idx in enumerate(groups): - X[idx, i] = 1.0 - - K = X @ X.T - K /= K.diagonal().mean() - jitter(K) - - return (_symmetric_decomp(K), K) - - -def jitter(K): - with errstate(divide="raise", invalid="raise"): - # This small diagonal offset is to guarantee the full-rankness. - K += 1e-8 * eye(K.shape[0]) - - return K - - -def create_variances(r0, v0, has_kinship=True) -> Variances: - """ - Remember that: - - cov(𝐲) = 𝓋₀(1-ρ₀)𝙳𝟏𝟏ᵀ𝙳 + 𝓋₀ρ₀𝙳𝙴𝙴ᵀ𝙳 + 𝓋₁ρ₁EEᵀ + 𝓋₁(1-ρ₁)𝙺 + 𝓋₂𝙸. - - Let us define: - - σ²_g = 𝓋₀(1-ρ₀) (variance explained by persistent genetic effects) - σ²_gxe = 𝓋₀ρ₀ (variance explained by GxE effects) - - σ²_e = 𝓋₁ρ₁ (variance explained by environmental effects) - σ²_k = 𝓋₁(1-ρ₁) (variance explained by population structure) - σ²_n = 𝓋₂ (residual variance, noise) - - We set the total variance to sum up to 1: - - 1 = σ²_g + σ²_gxe + σ²_e + σ²_k + σ²_n - - We set the variances explained by the non-genetic terms to be equal: - - v = σ²_e = σ²_k = σ²_n - - For `has_kinship=False`, we instead set the variances such that: - - v = σ²_e = σ²_n - - Parameters - ---------- - r0 : float - This is ρ₀. - v0 : float - This is 𝓋₀. 
- """ - v_g = v0 * (1 - r0) - v_gxe = v0 * r0 - - v_k = 0.0 - if has_kinship: - v = (1 - v_gxe - v_g) / 3 - v_e = v - v_k = v - v_n = v - else: - v = (1 - v_gxe - v_g) / 2 - v_e = v - v_n = v - - variances = {"g": v_g, "gxe": v_gxe, "e": v_e, "n": v_n} - if has_kinship: - variances["k"] = v_k - else: - variances["k"] = None - - return Variances(**variances) - - -def sample_persistent_effsizes( - n_effects: int, causal_indices: list, variance: float, random: Generator -): - """ - Let 𝚓 denote a sample index and 𝚔 denote a SNP index. Let 𝚟ⱼ = 𝐠ⱼᵀ𝛃. - We assume that 𝑔ⱼₖ is a random variable such that: - - 𝔼[𝑔ⱼₖ] = 0 - 𝔼[𝑔ⱼₖ²] = 1 - - And we also assume that SNPs are uncorrelated from each other: 𝔼[𝑔ⱼₖ⋅𝑔ⱼᵣ] = 0 - for 𝚔≠𝚛. - Assuming that 𝛃 is given (fixed), we want to simulate 𝛃 such that: - - 𝔼[𝚟ⱼ] = 𝔼[∑ₖ𝑔ⱼₖ𝛽ₖ] = ∑ₖ𝔼[𝑔ⱼₖ]𝛽ₖ = 0 - 𝔼[𝚟ⱼ²] = 𝔼[(∑ₖ𝑔ⱼₖ𝛽ₖ)²] = ∑ₖ𝔼[𝑔ⱼₖ²]𝛽ₖ² = ∑ₖ𝛽ₖ² = 𝓋. - - Let 𝚒 denote a causal index. We initialize 𝛃←𝟎 and then randomly set 𝛽ᵢϵ{-1,+1} for - the causal SNPs. At last, we set 𝛃←𝛃×√(𝓋/𝘯) where 𝘯 is the number of causal SNPs. - This way we have ∑ₖ𝛽ₖ² = 𝓋. - - Parameters - ---------- - n_effects : int - Number of effects. - causal_indices : list - List of causal SNPs. - variance : float - Correspond to 𝓋. - """ - n_causals = len(causal_indices) - - effsizes = zeros(n_effects) - if variance == 0.0: - return effsizes - - effsizes[causal_indices] = random.choice([+1, -1], size=len(causal_indices)) - with errstate(divide="raise", invalid="raise"): - effsizes *= sqrt(variance / n_causals) - - return effsizes - - -def sample_persistent_effects(X, effsizes, variance: float): - y_g = X @ effsizes - if variance > 0: - _ensure_moments(y_g, 0, variance) - return y_g - - -def sample_gxe_effects(G, E, causal_indices: list, variance: float, random: Generator): - """ - Let 𝚒 denote a SNP index and 𝚓 denote an environment. - Let 𝑦₂ = ∑ᵢ(𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ) be the total GxE effect with - - 𝜶ᵢ ∼ 𝓝(𝟎, 𝜎ᵢ²I) - - for every SNP ᵢ. 
- We have - - 𝔼[𝑦₂] = ∑ᵢ𝔼[𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ] = ∑ᵢ𝔼[𝑔ᵢ]𝔼[𝛜ᵀ𝜶ᵢ] = ∑ᵢ0⋅𝔼[𝛜ᵀ𝜶ᵢ] = 0, - - where 𝑔ᵢ and 𝛜ᵀ𝜶ᵢ are assumed to be uncorrelated. - - We also have - - 𝔼[𝑦₂²] = 𝔼[(∑ᵢ𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ)²] = ∑ᵢ∑ⱼ𝔼[𝜖ⱼ²]𝔼[𝛼ᵢⱼ²] = ∑ᵢ𝜎ᵢ² = 𝜎², - - after a couple of assumptions. - - We define 𝜎ᵢ²=𝑣ᵢ if 𝑔ᵢ is causal and 𝜎ᵢ²=0 otherwise. We assume all causal SNPs - to have equal effect as defined by 𝑣ᵢ=𝜎²/𝑛₂, where 𝑛₂ is the number of SNPs - having GxE effects. - - We also assume that 𝔼[𝜖ⱼ]=0 and 𝔼[𝜖ⱼ²]=1/𝑛ₑ for every environment 𝚓. - """ - n_samples = G.shape[0] - n_envs = E.shape[1] - n_causals = len(causal_indices) - - y2 = zeros(n_samples) - if variance == 0.0: - return y2 - - vi = variance / n_causals - for causal in causal_indices: - # 𝜶ᵢ ∼ 𝓝(𝟎, 𝜎ᵢ²I) - alpha = sqrt(vi) * random.normal(size=n_envs) - - # Make the sample statistics close to population - # statistics - if n_envs > 1: - _ensure_moments(alpha, 0, sqrt(vi)) - - # 𝜷 = 𝛜ᵀ𝜶ᵢ - beta = E @ alpha - - # 𝑔ᵢ⋅𝛜ᵀ𝜶ᵢ - y2 += G[:, causal] * beta - - _ensure_moments(y2, 0, variance) - - return y2 - - -# def sample_environment_effects(E, variance: float, random): -# from numpy import sqrt - -# n_envs = E.shape[1] -# effsizes = sqrt(variance) * random.randn(n_envs) -# y3 = E @ effsizes - -# _ensure_moments(y3, 0, variance) - -# return y3 - - -# def sample_random_effect(K, variance: float, random: Generator): -# y = random.multivariate_normal(zeros(K.shape[0]), K, method="cholesky") - -# _ensure_moments(y, 0, variance) - -# return y - -def _sample_random_effect(X, variance: float, random: Generator): - u = sqrt(variance) * random.normal(size=X.shape[1]) - y = X @ u - - _ensure_moments(y, 0, variance) - - return y - - -def sample_random_effect(X, variance: float, random: Generator): - if not isinstance(X, tuple): - return _sample_random_effect(X, variance, random) - - n = X[0].shape[0] - y = zeros(n) - for L in X: - u = sqrt(variance) * random.normal(size=L.shape[1]) - y += L @ u - _ensure_moments(y, 0, variance) - - return y - - -def 
sample_noise_effects(n_samples: int, variance: float, random: Generator): - y5 = sqrt(variance) * random.normal(size=n_samples) - _ensure_moments(y5, 0, variance) - - return y5 - - -def sample_phenotype_gxe( - offset: float, - n_individuals: int, - n_snps: int, - n_cells: Union[int, List[int]], - n_env_groups: int, - maf_min: float, - maf_max: float, - g_causals: list, - gxe_causals: list, - variances: Variances, - random: Generator, - env_term: Term = Term.RANDOM, -) -> Simulation: - """ - Parameters - ---------- - n_cells - Integer number of array of integers. - """ - mafs = sample_maf(n_snps, maf_min, maf_max, random) - - G = sample_genotype(n_individuals, mafs, random) - G = repeat(G, n_cells, axis=0) - G = column_normalize(G) - - n_samples = G.shape[0] - - if isscalar(n_cells): - individual_groups = array_split(range(n_samples), n_individuals) - else: - individual_groups = split(range(n_samples), cumsum(n_cells))[:-1] - - env_groups = array_split(random.permutation(range(n_samples)), n_env_groups) - - E = sample_covariance_matrix(n_samples, env_groups)[0] - - Lk, K = sample_covariance_matrix(n_samples, individual_groups) - [U, S, _] = economic_svd(E) - us = U * S - Ls = tuple([ddot(us[:, i], Lk) for i in range(us.shape[1])]) - - beta_g = sample_persistent_effsizes(n_snps, g_causals, variances.g, random) - y_g = sample_persistent_effects(G, beta_g, variances.g) - - y_gxe = sample_gxe_effects(G, E, gxe_causals, variances.gxe, random) - - y_k = sample_random_effect(Ls, variances.k, random) - - if env_term is Term.RANDOM: - y_e = sample_random_effect(E, variances.e, random) - elif env_term is Term.FIXED: - n = E.shape[1] - beta_e = sample_persistent_effsizes(n, list(range(n)), variances.e, random) - y_e = sample_persistent_effects(E, beta_e, variances.e) - else: - raise ValueError("Invalid term.") - - y_n = sample_noise_effects(n_samples, variances.n, random) - - M = ones((K.shape[0], 1)) - y = offset + y_g + y_gxe + y_k + y_e + y_n - - simulation = Simulation( - 
mafs=mafs, - offset=offset, - beta_g=beta_g, - y_g=y_g, - y_gxe=y_gxe, - y_k=y_k, - y_e=y_e, - y_n=y_n, - y=y, - variances=variances, - Lk=Lk, - Ls=Ls, - K=K, - E=E, - G=G, - M=M, - ) - - return simulation - - -def sample_phenotype( - offset: float, - n_individuals: int, - n_snps: int, - n_cells: Union[int, List[int]], - n_env: int, - n_env_groups: int, - maf_min: float, - maf_max: float, - g_causals: list, - gxe_causals: list, - variances: Variances, - random: Generator, -) -> Simulation: - """ - Parameters - ---------- - n_cells - Integer number of array of integers. - """ - mafs = sample_maf(n_snps, maf_min, maf_max, random) - - G = sample_genotype(n_individuals, mafs, random) - G = repeat(G, n_cells, axis=0) - G = column_normalize(G) - - n_samples = G.shape[0] - individual_groups = array_split(range(n_samples), n_individuals) - - env_groups = array_split(random.permutation(range(n_samples)), n_env_groups) - E = create_environment_matrix(n_samples, n_env, env_groups, random) - - Lk, K = sample_covariance_matrix(n_samples, individual_groups) - - beta_g = sample_persistent_effsizes(n_snps, g_causals, variances.g, random) - - y_g = sample_persistent_effects(G, beta_g, variances.g) - - y_gxe = sample_gxe_effects(G, E, gxe_causals, variances.gxe, random) - - y_k = sample_random_effect(Lk, variances.k, random) - - y_e = sample_random_effect(E, variances.e, random) - - y_n = sample_noise_effects(n_samples, variances.n, random) - - M = ones((K.shape[0], 1)) - y = offset + y_g + y_gxe + y_k + y_e + y_n - - simulation = Simulation( - mafs=mafs, - offset=offset, - beta_g=beta_g, - y_g=y_g, - y_gxe=y_gxe, - y_k=y_k, - y_e=y_e, - y_n=y_n, - y=y, - variances=variances, - Lk=Lk, - K=K, - E=E, - G=G, - M=M, - ) - - return simulation - - -def _ensure_moments(arr, mean: float, variance: float): - arr -= arr.mean(0) + mean - with errstate(divide="raise", invalid="raise"): - arr /= arr.std(0) - arr *= sqrt(variance) - - -def _symmetric_decomp(H): - [U, S, _] = economic_svd(H) - 
return ddot(U, sqrt(S)) diff --git a/cellregmap/_types.py b/cellregmap/_types.py deleted file mode 100644 index 895923a..0000000 --- a/cellregmap/_types.py +++ /dev/null @@ -1,8 +0,0 @@ -from enum import Enum - -__all__ = ["Term"] - - -class Term(Enum): - FIXED = 1 - RANDOM = 2 diff --git a/cellregmap/test/__init__.py b/cellregmap/test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/cellregmap/test/old_test_struct_lmm2.py b/cellregmap/test/old_test_struct_lmm2.py deleted file mode 100644 index 24c690d..0000000 --- a/cellregmap/test/old_test_struct_lmm2.py +++ /dev/null @@ -1,99 +0,0 @@ -import pytest -import scipy.stats as stats -from numpy import array, asarray, eye, hstack, ones, sqrt, stack, tile, zeros -from numpy.random import RandomState -from numpy.testing import assert_allclose - -from struct_lmm2 import StructLMM2 -from struct_lmm2._simulate import ( - column_normalize, - create_environment_matrix, - sample_covariance_matrix, - sample_genotype, - sample_maf, - sample_persistent_effsizes, - create_variances, -) - - -@pytest.fixture -def data(): - seed = 0 - random = RandomState(seed) - n_samples = 100 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - # SNPs with persistent effect - g_snps = [5, 6] - # SNPs with interaction effect - gxe_snps = [10, 11] - # Contribution of interactions (proportion) - r0 = 0.8 - v0 = 0.4 - - # Simulate two environments - E = create_environment_matrix(n_samples) - - mafs = sample_maf(n_snps, maf_min, maf_max, random) - G = sample_genotype(n_samples, mafs, random) - # We normalize the columns of G so that we have 𝔼[𝐠ᵀ𝐠] = 1. 
- G = column_normalize(G) - - K = sample_covariance_matrix(n_samples, random) - v = create_variances(r0, v0) - beta_g = sample_effect_sizes(n_snps, g_snps, v["v_g"], random) - beta_gxe = sample_effect_sizes(n_snps, gxe_snps, v["v_gxe"], random) - y_g = G @ beta_g - - for i in range(n_snps): - # beta_gxe = random.multivariate_normal(zeros(n_samples), sigma_gxe[i] * Sigma) - beta_gxe = sigma_gxe[i] * u_gxe - y_gxe += G[:, i] * beta_gxe - - # return {"y": y, "W": W, "E": E, "K": K, "G": G} - - -@pytest.mark.skip("Not working yet.") -def test_structlmm_int_2kinships(): - random = RandomState(0) - n = 100 - c = 2 - y = random.randn(n) - M = ones((n, 1)) - W = random.randn(n, c) - E = random.randn(n, 4) - - p_values1 = [] - p_values2 = [] - - for i in range(10): - random = RandomState(i) - g = random.randn(n) - lmm2 = StructLMM2(y, W, E) - lmm2.fit_null(g) - _p = lmm2.score_2_dof(g) - p_values2.append(_p) - - y = y.reshape(y.shape[0], 1) - lmm = StructLMM(y, M, E) - g = g.reshape(g.shape[0], 1) - covs1 = hstack((W, g)) - lmm.fit(covs1) - _p = lmm.score_2dof_inter(g) - p_values1.append(_p) - - print(stats.pearsonr(p_values2, p_values1)[0]) - assert_allclose(p_values2, p_values1, rtol=1e-4) - - -@pytest.mark.skip("Not working yet.") -def test_structlmm2_interaction(data): - y = data["y"] - W = data["W"] - E = data["E"] - K = data["K"] - G = data["G"] - lmm2 = StructLMM2(y, W, E, K) - print(lmm2.scan_interaction(G)) - pass diff --git a/cellregmap/test/test_cellregmap.py b/cellregmap/test/test_cellregmap.py deleted file mode 100644 index e69de29..0000000 diff --git a/cellregmap/test/test_fixed_gxe.py b/cellregmap/test/test_fixed_gxe.py deleted file mode 100644 index 4327212..0000000 --- a/cellregmap/test/test_fixed_gxe.py +++ /dev/null @@ -1,137 +0,0 @@ -import numpy as np -from glimix_core.lmm import LMM -from numpy import arange, asarray, clip, concatenate, inf, median, newaxis -from numpy.random import RandomState, default_rng -from numpy_sugar import epsilon -from 
numpy_sugar.linalg import economic_qs, economic_qs_linear -from scipy.stats import chi2 - -from cellregmap import create_variances, sample_phenotype_fixed_gxe - - -def test_fixed_gxe(): - random = default_rng(20) - - n_individuals = 100 - # n_individuals = 200 - # n_individuals = 500 - # n_individuals = 250 - maf_min = 0.20 - # maf_min = 0.05 - maf_max = 0.45 - # n_snps = 250 - n_snps = 100 - # n_snps = 50 - # n_snps = 20 - # n_cells = 100 - # n_cells = 10 - # n_cells = 2 - n_cells = arange(n_individuals) + 1 - # n_cells = 1 - n_env_groups = 2 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - - s = sample_phenotype_fixed_gxe( - offset=offset, - n_individuals=n_individuals, - n_snps=n_snps, - n_cells=n_cells, - n_env_groups=n_env_groups, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - QS = economic_qs_linear(s.Lk) - # QS = economic_qs(s.K) - - # Test 1: - # H0: s.y_e + s.y_k + s.y_n - # H1: s.y_g + s.y_e + s.y_k + s.y_n - y = s.offset + s.y_g + 0*s.y_gxe + s.y_e + s.y_k + s.y_n - M = concatenate([s.M, s.E], axis=1) - lmm = LMM(y, M, QS, restricted=False) - lmm.fit(verbose=False) - scanner = lmm.get_fast_scanner() - data1 = scanner.fast_scan(s.G) - # Asserting to make sure that DoF==1 - assert data1["effsizes1"].ndim == 1 - dof = 1 - lml0 = scanner.null_lml() - lml1 = data1["lml"] - pv = test1_pvalues = lrt_pvalues(lml0, lml1, dof) - breakpoint() - print(pv[5], pv[6], pv[10], pv[11], median(pv[12:])) - # arange : 4.782253229179153e-192 1.0433583537506347e-196 0.01091492319670193 0.0010927629879084017 4.8750903985107926e-05 - # 2 n_cells : 1.090975503699979e-08 4.899993877637627e-08 0.009198608906693148 0.4204345922908054 0.5335573850563654 - # 10 n_cells : 3.982864661957504e-08 1.9070517075766003e-08 0.09201571609949227 0.35721972899471366 0.543538454382785 - # 100 n_cells: 4.469523142654005e-08 3.0916502150797485e-09 
0.07348502556111537 0.37843373021086246 0.5058335803047611 - - # Test 2: - # H0: s.y_g + s.y_e + s.y_k + s.y_n - # H1: s.y_gxe + s.y_g + s.y_e + s.y_k + s.y_n - y = s.offset + s.y_gxe + s.y_g + s.y_e + s.y_k + s.y_n - assert min(abs(y - s.y)) < 1e-10 - data2 = {"lml0": [], "lml1": []} - dof = 1 - for g in s.G.T: - g = g[:, newaxis] - M = concatenate([s.M, g, s.E], axis=1) - lmm = LMM(y, M, QS, restricted=False) - lmm.fit(verbose=False) - scanner = lmm.get_fast_scanner() - d = scanner.fast_scan(s.E * g) - # Asserting to make sure that DoF==1 - assert d["effsizes1"].ndim == 1 - lml0 = scanner.null_lml() - lml1 = d["lml"] - data2["lml0"].append(lml0) - data2["lml1"].append(lml1) - - lml0 = asarray(data2["lml0"]) - lml1 = concatenate(data2["lml1"]) - pv = test2_pvalues = lrt_pvalues(lml0, lml1, dof) - breakpoint() - print(pv[5], pv[6], pv[10], pv[11], median(pv[12:])) - # arange : 0.6084591769169483 3.6950357207986164e-10 3.457029241798351e-227 1.2283428436867437e-223 0.005415575098636543 - # 2 n_cells : 0.07359659661565397 0.4350647685877501 2.3937927457201096e-07 1.9917895864290987e-16 0.43616571604445775 - # 10 n_cells : 0.3210415979502985 0.007255621562641298 1.109487704555233e-32 1.0877864951200334e-44 0.17000683356223117 - # 100 n_cells: 0.25813564725186755 8.150568151907782e-22 2.2250738585072014e-308 2.2250738585072014e-308 0.00013995763429074527 - pass - -def compute_pvalues(): - stats["pv20"] = lrt_pvalues(stats["lml0"], stats["lml2"], stats["dof20"]) - -def lrt_pvalues(null_lml, alt_lmls, dof=1): - """ - Compute p-values from likelihood ratios. - - These are likelihood ratio test p-values. - - Parameters - ---------- - null_lml : float - Log of the marginal likelihood under the null hypothesis. - alt_lmls : array_like - Log of the marginal likelihoods under the alternative hypotheses. - dof : int - Degrees of freedom. - - Returns - ------- - pvalues : ndarray - P-values. 
- """ - lrs = clip(-2 * null_lml + 2 * asarray(alt_lmls, float), epsilon.super_tiny, inf) - pv = chi2(df=dof).sf(lrs) - return clip(pv, epsilon.super_tiny, 1 - epsilon.tiny) - diff --git a/cellregmap/test/test_math.py b/cellregmap/test/test_math.py deleted file mode 100644 index 75135de..0000000 --- a/cellregmap/test/test_math.py +++ /dev/null @@ -1,91 +0,0 @@ -import pytest -from celregmap._math import ( - P_matrix, - QSCov, - qmin, - rsolve, - score_statistic, - score_statistic_distr_weights, - score_statistic_liu_params, -) -from numpy import array, eye -from numpy.random import RandomState -from numpy.testing import assert_allclose -from numpy_sugar.linalg import economic_qs - - -@pytest.fixture -def data(): - random = RandomState(0) - n_samples = 3 - n_covariates = 2 - - W = random.randn(n_samples, n_covariates) - - K0 = random.randn(n_samples, n_samples) - K0 = K0 @ K0.T - - v = 0.2 - K = v * K0 + eye(n_samples) - dK = K0 - - alpha = array([0.5, -0.2]) - y = random.multivariate_normal(W @ alpha, K) - - return {"y": y, "W": W, "K": K, "dK": dK} - - -def test_QSCov(data): - rank = 2 - K = data["K"][:, :rank] @ data["K"][:, :rank].T - n_samples = K.shape[0] - QS = economic_qs(K) - a = 0.2 - b = 0.3 - - finalK = a * K + b * eye(n_samples) - - qscov = QSCov(QS[0][0], QS[1], a, b) - v = array([0.3, -0.2, 0.19]) - - assert_allclose(finalK @ v, qscov.dot(v)) - assert_allclose(rsolve(finalK, v), qscov.solve(v)) - - -def test_P_matrix(data): - P = array( - [ - [0.50355613, -0.24203676, -0.34880245], - [-0.24203676, 0.11633617, 0.16765363], - [-0.34880245, 0.16765363, 0.24160792], - ] - ) - assert_allclose(P_matrix(data["W"], data["K"]), P) - - -def test_score_statistic(data): - q = score_statistic(data["y"], data["W"], data["K"], data["dK"]) - assert_allclose(q, 0.49961017073389324) - - -def test_score_statistic_distr_weights(data): - weights = score_statistic_distr_weights(data["W"], data["K"], data["dK"]) - assert_allclose(weights, array([4.55266277e-09, 
3.46249449e-01]), atol=1e-7) - - -def test_score_statistic_liu_params(data): - q = score_statistic(data["y"], data["W"], data["K"], data["dK"]) - weights = score_statistic_distr_weights(data["W"], data["K"], data["dK"]) - params = score_statistic_liu_params(q, weights) - assert_allclose(params["pv"], 0.22966744652848403) - assert_allclose(params["mu_q"], 0.34624945394475326) - assert_allclose(params["sigma_q"], 0.48967066729451103) - assert_allclose(params["dof_x"], 1.0) - - -def test_qmin(data): - params = [] - params += [{"pv": 0.22966742, "mu_q": 0.34945, "sigma_q": 0.48670, "dof_x": 1.5}] - params += [{"pv": 0.65, "mu_q": 0.695, "sigma_q": 0.1, "dof_x": 0.7}] - expected = [0.5506645025120773, 0.7157125486956082] - assert_allclose(qmin(params), expected) diff --git a/cellregmap/test/test_simulation.py b/cellregmap/test/test_simulation.py deleted file mode 100644 index 5c54d07..0000000 --- a/cellregmap/test/test_simulation.py +++ /dev/null @@ -1,217 +0,0 @@ -import pytest -from numpy import logical_and, ones, tile, zeros -from numpy.linalg import matrix_rank -from numpy.random import RandomState -from numpy.testing import assert_, assert_allclose, assert_equal - -from cellregmap._simulate import ( - column_normalize, - create_environment_matrix, - create_variances, - sample_covariance_matrix, - sample_genotype, - sample_gxe_effects, - sample_maf, - sample_noise_effects, - sample_persistent_effsizes, - sample_phenotype, -) - - -def test_sample_maf(): - random = RandomState(0) - n_snps = 30 - maf_min = 0.2 - maf_max = 0.3 - mafs = sample_maf(n_snps, maf_min, maf_max, random) - assert_(all(logical_and(maf_min <= mafs, mafs <= maf_max))) - assert_(len(mafs) == n_snps) - - -def test_sample_genotype(): - random = RandomState(0) - n_samples = 3 - n_snps = 30 - maf_min = 0.2 - maf_max = 0.3 - mafs = sample_maf(n_snps, maf_min, maf_max, random) - G = sample_genotype(n_samples, mafs, random) - assert_(G.shape == (n_samples, n_snps)) - - A = set(list(G.ravel())) - B = 
set([0.0, 1.0, 2.0]) - assert_(A - B == set()) - - -def test_column_normalize(): - random = RandomState(0) - n_samples = 10 - n_snps = 30 - maf_min = 0.2 - maf_max = 0.3 - mafs = sample_maf(n_snps, maf_min, maf_max, random) - G = sample_genotype(n_samples, mafs, random) - G = column_normalize(G) - assert_allclose(G.mean(0), zeros(n_snps), atol=1e-7) - assert_allclose(G.std(0), ones(n_snps)) - - -def test_sample_covariance_matrix(): - random = RandomState(0) - n_samples = 5 - K = sample_covariance_matrix(n_samples, random) - assert_(K.shape == (n_samples, n_samples)) - assert_allclose(K.diagonal().mean(), 1.0) - assert_(matrix_rank(K) == K.shape[0]) - - n_rep = 2 - K = sample_covariance_matrix(n_samples, random, n_rep) - assert_(K.shape == (n_rep * n_samples, n_rep * n_samples)) - assert_allclose(K.diagonal().mean(), 1.0) - assert_(matrix_rank(K) == K.shape[0]) - - -def test_variances(): - r0 = 0.1 - v0 = 0.5 - - v = create_variances(r0, v0) - assert_allclose(v.g + v.gxe + v.k + v.e + v.n, 1.0) - assert_allclose(v.e, v.n) - assert_allclose(v.n, v.k) - - has_kinship = False - v = create_variances(r0, v0, has_kinship) - assert_allclose(v.g + v.gxe + v.e + v.n, 1.0) - assert_allclose(v.e, v.n) - assert_equal(v.k, None) - - -def test_sample_persistent_effsizes(): - random = RandomState(0) - n_effects = 10 - causal_indices = [3, 5] - variance = 0.9 - - with pytest.raises(IndexError): - sample_persistent_effsizes(2, causal_indices, variance, random) - - beta = sample_persistent_effsizes(n_effects, causal_indices, variance, random) - assert_allclose(beta.mean(), 0.0, atol=1e-7) - assert_allclose((beta ** 2).sum(), variance) - - -def test_sample_gxe_effects(): - random = RandomState(0) - n_samples = 10 - n_snps = 30 - n_rep = 2 - maf_min = 0.2 - maf_max = 0.3 - mafs = sample_maf(n_snps, maf_min, maf_max, random) - - G = sample_genotype(n_samples, mafs, random) - G = tile(G, (n_rep, 1)) - G = column_normalize(G) - - E = random.randn(5, 5) - E = create_environment_matrix(E, 
n_samples, n_rep, 3, random) - E = column_normalize(E) - - causal_indices = [3, 5, 8] - variance = 0.9 - - y2 = sample_gxe_effects(G, E, causal_indices, variance, random) - assert_allclose(y2.mean(), 0.0, atol=1e-7) - assert_allclose(y2.var(), variance) - - -# TODO: put it back -# def test_sample_environment_effects(): -# random = RandomState(0) -# n_samples = 10 - -# E = random.randn(5, 5) -# E = create_environment_matrix(E, n_samples, 2, 3, random) -# E = column_normalize(E) - -# variance = 0.3 -# y3 = sample_environment_effects(E, variance, random) - -# assert_allclose(y3.mean(), 0.0, atol=1e-7) -# assert_allclose(y3.var(), variance) - - -# def test_sample_population_effects(): -# random = RandomState(0) -# n_samples = 10 -# variance = 0.4 -# n_rep = 1 -# K = sample_covariance_matrix(n_samples, random, n_rep) -# y4 = sample_population_effects(K, variance, random) - -# assert_allclose(y4.mean(), 0.0, atol=1e-7) -# assert_allclose(y4.var(), variance) - - -def test_sample_noise_effects(): - random = RandomState(0) - n_samples = 10 - - variance = 0.3 - y5 = sample_noise_effects(n_samples, variance, random) - - assert_allclose(y5.mean(), 0.0, atol=1e-7) - assert_allclose(y5.var(), variance) - - -def test_sample_phenotype(): - from numpy import corrcoef - - random = RandomState(0) - - r0 = 0.1 - v0 = 0.5 - v = create_variances(r0, v0) - - E = random.randn(30, 10) - - offset = 0.3 - n_samples = 500 - n_rep = 2 - n_env = 3 - s = sample_phenotype( - offset=offset, - E=E, - n_samples=n_samples, - n_snps=300, - n_rep=n_rep, - n_env=n_env, - maf_min=0.1, - maf_max=0.4, - g_causals=[3, 4], - gxe_causals=[4, 5], - variances=v, - random=random, - ) - assert_allclose( - s.y_g.var() + s.y_gxe.var() + s.y_n.var() + s.y_e.var() + s.y_k.var(), 1.0 - ) - assert_(abs(corrcoef(s.y_g, s.y_gxe)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_g, s.y_n)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_g, s.y_e)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_g, s.y_k)[0, 1]) < 0.1) - - 
assert_(abs(corrcoef(s.y_gxe, s.y_n)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_gxe, s.y_e)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_gxe, s.y_k)[0, 1]) < 0.1) - - assert_(abs(corrcoef(s.y_n, s.y_e)[0, 1]) < 0.1) - assert_(abs(corrcoef(s.y_n, s.y_k)[0, 1]) < 0.1) - - assert_(abs(corrcoef(s.y_e, s.y_k)[0, 1]) < 0.1) - assert_allclose(s.y, offset + s.y_g + s.y_gxe + s.y_n + s.y_e + s.y_k) - assert_allclose(s.y.mean(), offset) - - assert_(s.E.shape[0] == n_samples * n_rep) - assert_(s.E.shape[1] == n_env) diff --git a/cellregmap/test/test_struct_lmm2.py b/cellregmap/test/test_struct_lmm2.py deleted file mode 100644 index 22aa2e6..0000000 --- a/cellregmap/test/test_struct_lmm2.py +++ /dev/null @@ -1,451 +0,0 @@ -from numpy import arange, asarray, eye, median, min, ones -from numpy.linalg import cholesky -from numpy.random import RandomState, default_rng -from numpy.testing import assert_, assert_allclose - -from cellregmap import create_variances, sample_phenotype, sample_phenotype_gxe - - -# from struct_lmm import StructLMM -# @pytest.mark.skip -def test_struct_lmm2_assoc(): - random = RandomState(0) - - n_samples = 50 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - n_rep = 1 - n_env = 2 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - E = random.normal(size=[n_samples, n_env]) - - s = sample_phenotype( - offset=offset, - E=E, - n_samples=n_samples, - n_snps=n_snps, - n_rep=n_rep, - n_env=n_env, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - M = ones((n_samples, 1)) - # slmm = StructLMM(s.y.copy(), M=M, E=s.E, W=s.E) - # slmm.fit(verbose=False) - - # p_values0 = [] - # print() - # for i in range(n_snps): - # g = s.G[:, [i]] - # p = slmm.score_2dof_assoc(g) - # print("{}\t{}".format(i, p)) - # p_values0.append(p) - - slmm2 = StructLMM2(s.y, M, s.E) - slmm2.fit_null_association() - p_values1 = slmm2.scan_association(s.G) - 
# for i, pv in enumerate(p_values1): - # print("{}\t{}".format(i, pv)) - - -def test_struct_lmm2_inter(): - random = RandomState(3) - - n_samples = 280 - # n_samples = 500 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - n_rep = 1 - n_env = 4 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - E = random.normal(size=[n_samples, 2]) - - v = create_variances(r0, v0) - - s = sample_phenotype( - offset=offset, - E=E, - n_samples=n_samples, - n_snps=n_snps, - n_rep=n_rep, - n_env=n_env, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - M = ones((n_samples, 1)) - - # p_values0 = [] - # for i in range(n_snps): - # g = s.G[:, [i]] - # Mg = concatenate([M, g], axis=1) - # slmm_int = StructLMM(s.y.copy(), M=Mg, E=s.E, W=s.E) - # slmm_int.fit(verbose=False) - # p = slmm_int.score_2dof_inter(g) - # print("{}\t{}".format(i, p)) - # p_values0.append(p) - - # TODO: add kinship to test it properly - slmm2 = StructLMM2(s.y, M, s.E) - pvals = slmm2.scan_interaction(s.G) - - causal_pvalues = [pvals[i] for i in range(len(pvals)) if i in gxe_causals] - noncausal_pvalues = [pvals[i] for i in range(len(pvals)) if i not in gxe_causals] - causal_pvalues = asarray(causal_pvalues) - noncausal_pvalues = asarray(noncausal_pvalues) - - assert_(all(causal_pvalues < 1e-7)) - assert_(all(noncausal_pvalues > 1e-3)) - - -def test_struct_lmm2_inter_kinship(): - random = RandomState(8) - - n_samples = 346 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - n_rep = 1 - n_env = 3 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - E = random.normal(size=[n_samples, 2]) - - v = create_variances(r0, v0) - - s = sample_phenotype( - offset=offset, - E=E, - n_samples=n_samples, - n_snps=n_snps, - n_rep=n_rep, - n_env=n_env, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - M = 
ones((n_samples, 1)) - - G_kinship = cholesky(s.K + eye(n_samples) * 1e-7) - slmm2 = StructLMM2(s.y, M, s.E, G_kinship) - pvals = slmm2.scan_interaction(s.G) - - causal_pvalues = [pvals[i] for i in range(len(pvals)) if i in gxe_causals] - noncausal_pvalues = [pvals[i] for i in range(len(pvals)) if i not in gxe_causals] - causal_pvalues = asarray(causal_pvalues) - noncausal_pvalues = asarray(noncausal_pvalues) - - assert_(all(causal_pvalues < 1e-7)) - assert_(all(noncausal_pvalues > 1e-3)) - - -def test_struct_lmm2_inter_kinship_permute(): - random = RandomState(2) - - n_samples = 500 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - n_rep = 1 - n_env = 3 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - E = random.normal(size=[n_samples, 2]) - - v = create_variances(r0, v0) - - s = sample_phenotype( - offset=offset, - E=E, - n_samples=n_samples, - n_snps=n_snps, - n_rep=n_rep, - n_env=n_env, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - M = ones((n_samples, 1)) - - G_kinship = cholesky(s.K + eye(n_samples) * 1e-7) - slmm2 = StructLMM2(s.y, M, s.E, G_kinship) - random = RandomState(1) - idx = random.permutation(s.E.shape[0]) - pvals = slmm2.scan_interaction(s.G, idx_E=idx) - assert_(median(pvals) > 0.3) - assert_(min(pvals) > 0.04) - - -def test_struct_lmm2_inter_kinship_repetition(): - import numpy as np - - random = default_rng(20) - - n_individuals = 100 - # n_individuals = 200 - # n_individuals = 500 - # n_individuals = 250 - maf_min = 0.40 - # maf_min = 0.20 - # maf_min = 0.05 - maf_max = 0.45 - # n_snps = 250 - n_snps = 100 - # n_snps = 50 - # n_snps = 20 - # n_cells = 100 - # n_cells = 10 - # n_cells = 2 - n_cells = arange(n_individuals) + 1 - # n_cells = 1 - n_env = 2 - n_env_groups = 3 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - - s = sample_phenotype( - offset=offset, 
- n_individuals=n_individuals, - n_snps=n_snps, - n_cells=n_cells, - n_env=n_env, - n_env_groups=n_env_groups, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - slmm2 = StructLMM2(s.y, s.M, s.E, s.Lk) - # idx = random.permutation(s.E.shape[0]) - # pvals = slmm2.scan_interaction(s.G, idx_E=idx) - from scipy.stats.stats import pearsonr - - corr = {} - corr_pvals = {} - - corr["y_e"] = [pearsonr(s.y_e, s.G[:, i])[0] for i in range(s.G.shape[1])] - corr_pvals["y_e"] = [pearsonr(s.y_e, s.G[:, i])[1] for i in range(s.G.shape[1])] - - corr["y_n"] = [pearsonr(s.y_n, s.G[:, i])[0] for i in range(s.G.shape[1])] - corr_pvals["y_n"] = [pearsonr(s.y_n, s.G[:, i])[1] for i in range(s.G.shape[1])] - - pvals, info = slmm2.scan_interaction(s.G) - print(pearsonr(pvals, np.abs(corr["y_e"]))) - print(pearsonr(pvals, np.abs(corr["y_n"]))) - assert_(median(pvals) > 0.3) - assert_(min(pvals) > 0.04) - - -def test_struct_lmm2_inter_kinship_repetition_gxe(): - import numpy as np - - random = default_rng(20) - - n_individuals = 100 - # n_individuals = 200 - # n_individuals = 500 - # n_individuals = 250 - maf_min = 0.40 - # maf_min = 0.20 - # maf_min = 0.05 - maf_max = 0.45 - # n_snps = 250 - n_snps = 100 - # n_snps = 50 - # n_snps = 20 - # n_cells = 100 - # n_cells = 10 - # n_cells = 2 - n_cells = arange(n_individuals) + 1 - # n_cells = 1 - n_env = 2 - n_env_groups = 3 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - - # Timing: - # - n_individuals - # - n_env_groups - # - n_samples - - s = sample_phenotype_gxe( - offset=offset, - n_individuals=n_individuals, - n_snps=n_snps, - n_cells=n_cells, - n_env=n_env, - n_env_groups=n_env_groups, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) - # idx = random.permutation(s.E.shape[0]) - 
# pvals = slmm2.scan_interaction(s.G, idx_E=idx) - from scipy.stats.stats import pearsonr - - corr = {} - corr_pvals = {} - - corr["y_e"] = [pearsonr(s.y_e, s.G[:, i])[0] for i in range(s.G.shape[1])] - corr_pvals["y_e"] = [pearsonr(s.y_e, s.G[:, i])[1] for i in range(s.G.shape[1])] - - corr["y_n"] = [pearsonr(s.y_n, s.G[:, i])[0] for i in range(s.G.shape[1])] - corr_pvals["y_n"] = [pearsonr(s.y_n, s.G[:, i])[1] for i in range(s.G.shape[1])] - - pvals, info = slmm2.scan_interaction(s.G) - print(pearsonr(pvals, np.abs(corr["y_e"]))) - print(pearsonr(pvals, np.abs(corr["y_n"]))) - assert_(median(pvals) > 0.3) - assert_(min(pvals) > 0.04) - - -def test_struct_lmm2_inter_kinship_predict(): - random = RandomState(0) - - n_individuals = 100 - maf_min = 0.05 - maf_max = 0.45 - n_snps = 20 - n_cells = 2 - n_env_groups = 3 - offset = 0.3 - r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - - s = sample_phenotype_gxe( - offset=offset, - n_individuals=n_individuals, - n_snps=n_snps, - n_cells=n_cells, - n_env_groups=n_env_groups, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) - beta_g_s, beta_gxe_s = slmm2.predict_interaction(s.G, s.mafs) - assert_allclose(beta_g_s[3], -0.07720025290188615) - assert_allclose(beta_g_s[-1], 0.022415332334122483) - assert_allclose(beta_gxe_s[1, 1], 0.010062608120425824) - assert_allclose(beta_gxe_s[13, -1], -0.05566938579548831) - - -def test_struct_lmm2_estimate_aggregate_environment(): - random = default_rng(20) - - n_individuals = 100 - # n_individuals = 200 - # n_individuals = 500 - # n_individuals = 250 - maf_min = 0.40 - # maf_min = 0.20 - # maf_min = 0.05 - maf_max = 0.45 - # n_snps = 250 - n_snps = 100 - # n_snps = 50 - # n_snps = 20 - # n_cells = 100 - # n_cells = 10 - n_cells = 2 - # n_cells = arange(n_individuals) + 1 - # n_cells = 1 - n_env_groups = 3 - offset = 0.3 
- r0 = 0.5 - v0 = 0.5 - g_causals = [5, 6] - gxe_causals = [10, 11] - - v = create_variances(r0, v0) - - # Timing: - # - n_individuals - # - n_env_groups - # - n_samples - - s = sample_phenotype_gxe( - offset=offset, - n_individuals=n_individuals, - n_snps=n_snps, - n_cells=n_cells, - n_env_groups=n_env_groups, - maf_min=maf_min, - maf_max=maf_max, - g_causals=g_causals, - gxe_causals=gxe_causals, - variances=v, - random=random, - ) - - slmm2 = StructLMM2(s.y, s.M, s.E, s.Ls) - import numpy as np - for i in range(5): - a = slmm2.estimate_aggregate_environment(s.G[:, i]) - print(np.var(a)) - b = slmm2.estimate_aggregate_environment(s.G[:, 10]) - c = slmm2.estimate_aggregate_environment(s.G[:, 11]) - print(np.var(b)) - print(np.var(c)) - pass diff --git a/citation.md b/citation.md new file mode 100644 index 0000000..aea4cdb --- /dev/null +++ b/citation.md @@ -0,0 +1,18 @@ +--- +layout: default +title: Citation +--- + +If you use our CellRegMap model in your work, please cite our [preprint](https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1) as: + +Cuomo ASE et al., CellRegMap: A statistical framework for mapping context-specific regulatory variants using scRNA-seq, bioRxiv, 2021 + +## bibtex citation + + @article{Cuomo2021, + author = {Cuomo, A.S.E. and Heinen, T. and Vagiaki, D. and Horta, D. and Marioni, J.C. and Stegle, O.}, + title = {CellRegMap: A statistical framework for mapping context-specific regulatory variants using scRNA-seq}, + journal = {bioRxiv}, + year = {2021}, + url = {https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1} + } diff --git a/contact.md b/contact.md new file mode 100644 index 0000000..1ce075e --- /dev/null +++ b/contact.md @@ -0,0 +1,46 @@ +--- +layout: default +title: "Contact" +--- + +## Software maintainers + + + + + +
+
+
+ missing +
Anna Cuomo
Postdoc Scientist
Garvan Institute
a.cuomo@garvan.org.au
+
+
+
+
+ missing +
Danilo Horta
Software Developer
EBI
horta@ebi.ac.uk
+
+
+ +
+
+ missing +
Tobias Heinen
PhD Student
DKFZ
t.heinen@dkfz.de
+
+
+ +
diff --git a/dist/cellregmap-0.0.2.tar.gz b/dist/cellregmap-0.0.2.tar.gz new file mode 100644 index 0000000..933055c Binary files /dev/null and b/dist/cellregmap-0.0.2.tar.gz differ diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index d4bb2cb..0000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 566feac..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,61 +0,0 @@ -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = 'CellRegMap' -copyright = '2022, Anna Cuomo' -author = 'Anna Cuomo' - - -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.autosummary", - "sphinx.ext.napoleon", - "sphinx.ext.viewcode", -] - -autodoc_default_flags = ["members"] -autosummary_generate = True -napoleon_numpy_docstring = True - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -pygments_style = "default" -html_theme = "sphinx_rtd_theme" - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] diff --git a/docs/functions.rst b/docs/functions.rst deleted file mode 100644 index 191bcce..0000000 --- a/docs/functions.rst +++ /dev/null @@ -1,15 +0,0 @@ -********** -CellRegMap -********** - -.. currentmodule:: cellregmap - -.. 
autosummary:: - :toctree: _autosummary - :template: class.rst - - CellRegMap - run_association - run_interaction - estimate_betas - # lrt_pvalues diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 4610cec..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,39 +0,0 @@ -.. CellRegMap documentation master file, created by - sphinx-quickstart on Mon Jan 10 23:46:16 2022. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to CellRegMap's documentation! -====================================== - -What is CellRegMap ------------------- - -The cellular regulatory map (``cellregmap``) is a statistical framework to map context-specific effects of genetic variants on single-cell gene expression. -It builds on a linear mixed model and is implemented in Python. - -What do you need to run CellRegMap ----------------------------------- - -Single-cell expression profiles assayed through scRNA-seq in the form of a count matrix (cells x genes) - -Cellular contexts: these can be known factors or a latent representation of the space (e.g., PCs by cells) - -Genotypes: CellRegMap tests for individual eQTL effects between a genetic variant and a gene's expression, so genotypes need to be provided for variants of interest. - -A kinship matrix accounting for relatedness among samples. - -.. 
toctree:: - :maxdepth: 2 - :caption: Contents: - - install - functions - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index cb55400..0000000 --- a/docs/install.rst +++ /dev/null @@ -1,26 +0,0 @@ -****************** -Install CellRegMap -****************** - -Stable version --------------- - -``cellregmap`` can be installed `from PyPI `_ with ``pip``: -:: - python3 -m pip install cellregmap - - -Development version -------------------- - -To install ``cellregmap`` in development mode, install it from the `GitHub repository `_: -:: - git clone https://github.com/limix/CellRegMap.git - cd CellRegMap - python3 -m pip install -e - - -Docker installation -------------------- - -If you use Docker, you can also pull the `pre-build image from dockerhub `_ diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 153be5e..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. 
- echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 3bcf96c..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -sphinx>=2.1.2 -# cellregmap -git+https://github.com/limix/CellRegMap.git diff --git a/images/Cuomo_Anna.png b/images/Cuomo_Anna.png new file mode 100755 index 0000000..d431490 Binary files /dev/null and b/images/Cuomo_Anna.png differ diff --git a/images/Horta_Danilo.jpeg b/images/Horta_Danilo.jpeg new file mode 100644 index 0000000..99dbfa3 Binary files /dev/null and b/images/Horta_Danilo.jpeg differ diff --git a/images/Tobis_photo.jpeg b/images/Tobis_photo.jpeg new file mode 100644 index 0000000..0ed2fab Binary files /dev/null and b/images/Tobis_photo.jpeg differ diff --git a/index.md b/index.md new file mode 100644 index 0000000..25c0e6d --- /dev/null +++ b/index.md @@ -0,0 +1,14 @@ +--- +layout: default +title: CellRegMap +--- + +# CellRegMap + +The Cellular Regulatory Map (CellRegMap) is a linear mixed model approach to test for and characterize context-specific eQTL variants across several cell contexts and states. +CellRegMap leverages single cell RNA sequencing (scRNA-seq) data, and does not require discretization of cells into cell groups. +CellRegMap builds on the previously proposed [StructLMM](https://www.nature.com/articles/s41588-018-0271-0) but importantly can account for repeated observations for the same samples, _e.g._, multiple cells for the same donor, and for population stratification and relatedness. 
+ +![Fig1](https://user-images.githubusercontent.com/25035866/132485517-0e111014-1b80-42f1-bd8b-b9ca0750bd23.png) + +For more details on the CellRegMap model and its applications to both real and simulated data, see our [paper](https://www.embopress.org/doi/full/10.15252/msb.202110663). diff --git a/input_files.md b/input_files.md new file mode 100644 index 0000000..fcc0076 --- /dev/null +++ b/input_files.md @@ -0,0 +1,162 @@ +--- +layout: default +title: "Input Files" +mathjax: true +--- + +# The CellRegMap model + +The CellRegMap model can be cast as: + +$y = W\alpha + g\beta_G + g \odot \beta_{GxC} + c + u + \epsilon$, + +where + +$\beta_{GxC} \sim \mathcal{N} (0, \sigma^2_{GxC}CC^T)$, + +$c \sim \mathcal{N} (0, \sigma^2_{C}CC^T)$, + +$u \sim \mathcal{N} (0, \sigma^2_{KC}(CC^T \odot K))$, and + +$\epsilon \sim \mathcal{N} (0, \sigma^2_n I)$ + + + + +## Brief description of the model terms + + +The following terms should be provided as input files: + +* **Phenotype vector ($y$)** - in the linear mixed model, this is the outcome variable. +In eQTL mapping, this represents expression level of a given gene of interest, across samples. +The main application of CellRegMap is using scRNA-seq data, in which case this will be a column vector, with length corresponding to the number of cells considered. +For optimal fit with the model (which assumes a Gaussian distribution) we recommend [quantile normalising](https://github.com/limix/limix/blob/master/limix/qc/_quant_gauss.py) this vector, or at least [standardising](https://github.com/limix/limix/blob/master/limix/qc/_mean_std.py) it. + +* **Genotype vector ($g$)** - SNP vector. +This represents the genotype of each sample at the genomic locus of interest, and is typically modelled as 0, 1 or 2, representing the number of minor alleles (however, the model can also handle a continuous vector of dosages). 
+Note that a genotype file is well defined at the level of donors, and needs to be appropriately [expanded](https://github.com/annacuomo/CellRegMap_analyses/blob/main/endodiff/preprocessing/Expand_genotypes_kinship.ipynb) across cells. +It is also possible to input a matrix $G$ whose columns represent multiple SNPs ($g$'s) to be tested for that gene (see "Notes" below). + +* **Cellular context matrix ($C$)** - cellular environment/context matrix. +Rows are cells, columns are values across the different cellular contexts. +Columns of C can for example be principal components, or other latent factor representations of the data (e.g., using MOFA [1], ZINB-WaVE [2] or LDVAE [3]), binary vector encoding assignment to different cellular groups such as cell types, or any other factor, including environmental exposures, or disease state. +Best practice is to column-standardise this matrix. + +* **Kinship matrix ($K$)**, or its decomposition ($hK$, such that $K = hK @ hK^T$), a sample covariance, often the so-called [kinship](https://www.cog-genomics.org/plink/1.9/distance) (or genetic relationship matrix; GRM) matrix, appropriately [expanded](https://github.com/annacuomo/CellRegMap_analyses/blob/main/endodiff/preprocessing/Expand_genotypes_kinship.ipynb) across cells. + + + + +* **Covariate matrix ($W$)** - any additional fixed effect terms to include in the model, such as sex or age. +If no such terms are needed an intercept of ones should be provided. + + + +The following terms will be estimated by the model: + +* **SNP effect sizes**, both due to persistent effects ($\beta_G$) and to GxC interactions ($\beta_{GxC}$) can be estimated using the estimate_betas() function, see [usage page](https://limix.github.io/CellRegMap/usage.html). + +* **other inferred parameters** ($\alpha$, $\sigma^2$ values) are estimated by the model but not returned as values. 
+ +# Notes + +## Necessary inputs +The model will not run if one of **y, W, g** or **C** is not provided as input. + +* The following terms are absolutely necessary: expression phenotypes (**y**), genotypes (**g**), and cellular contexts (**C**). + +* A kinship matrix (**K**; or its decomposition **hK**, such that K = hK @ hK.T) is highly recommended, to appropriately account for sample structure, especially the repeatedness across cells from the same individual. + * if you do not have access to a GRM, consider providing a block diagonal sample covariance, with blocks corresponding to individuals. + * If K (or hK) is not provided, CellRegMap becomes equivalent to [StructLMM](https://limix.github.io/CellRegMap/structlmm.html). + + +* If no covariates (W) are necessary, simply provide a vector of [ones](https://numpy.org/doc/stable/reference/generated/numpy.ones.html) as an intercept term. + + +## Each SNP-gene pair should be tested independently +The test is run independently for each gene-SNP pair, thus in the model above, **y** and **g** are one-dimensional vectors, representing i) the expression of a single gene and ii) the genotypes at a single SNP, respectively. + +* The implementation does allow for multiple SNPs to be tested for a given gene, this can be achieved by providing a matrix G of which each column is a different SNP G=[g_1, .. g_n]. +In this case, the model **simply loops over each SNP and tests one at the time**, then returns a list of p-values, one per SNP. + +* On the other hand, each gene needs to be tested separately, as CellRegMap cannot take the full expression matrix as input. + +As tests are independent, we recommend parallelising as much as possible, for example submitting independent jobs for each chromosome, gene, or even gene-SNP pair. + +## Covariates, cell contexts and repeatedness are fixed +W, C, hK (and thus K) remain the same across all tests (_i.e._, across all SNP-gene pairs). 
+ +# Dimensionality + +Specified dimensionality for each of the terms, where n is the total number of cells: + +* **y**: n x 1 (only one gene tested at a time) +* **W**: n x c, where c is the number of fixed effect covariates (_e.g._, age, sex..) +* **C**: n x k, where k is the number of contexts to test for interactions +* **G**: n x s, where s is the number of SNPs to be tested for a given gene +* **hK**: n x p, where p is the number of individuals, decomposition of the n x n kinship matrix K + + + + +# Normalization + +For optimal model fit, we recommend standardizing or quantile normalizing (to a standard normal distribution) the phenotype vector **y** and column-standardizing the cellular contexts **C**. +Standardization refers to a transformation of a vector to have 0 mean and standard deviation 1. You can use [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) for this task. +Quantile normalization is a rank-normalization which enforces a standard normal distribution of the vector provided. +For an implementation of quantile-normalization see [here](https://github.com/limix/limix/blob/master/limix/qc/_quant_gauss.py). + + + +# Pseudocells + +This approach refers to the action of grouping together small numbers of similar cells into "pseudocells" to reduce issues due to sparsity and speed up computations by reducing sample size. +Existing implementations include [Metacell](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1812-2) and the [micro pooling approach](https://yoseflab.github.io/VISION/articles/micropooling.html) within the [Vision](https://www.nature.com/articles/s41467-019-12235-0) pipeline. +Those approaches do not directly take into account the presence of several genetically distinct donors, which is important here. +To address this, we recommend using one of these approaches for each donor separately. 
+For an implementation of how we computed meta-cells in the CellRegMap manuscript (for the neuronal differentiation data analysis), see [here](https://github.com/annacuomo/CellRegMap_analyses/blob/main/neuroseq/preprocessing/create_metacells.py). + + +# Multiple testing correction + +Since thousands of tests are typically run, [multiple testing correction](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) of the test p-values is necessary. +Below, we provide guidelines for how to correct for multiple testing for the two main tests implemented in CellRegMap. +Also refer to workflow [here](https://github.com/annacuomo/CellRegMap_analyses/blob/main/endodiff/usage/README.md) + +## Association test + +Run discovery, two-step multiple testing correction, 1) within gene across SNPs (FWER), 2) across genes (FDR). + + +## Interaction test + +Only one SNP per gene, or at least independent. +If one SNP per gene straight to step 2 (FDR), if multiple but independent Bonferroni as step 1, then step 2. + +# References + +[1] Argelaguet\*, Velten\* et al., Molecular Systems Biology, 2018 (MOFA: multi-omics factor analysis) - [link](https://www.embopress.org/doi/full/10.15252/msb.20178124) + +[2] Risso et al, Nature Communications, 2018 (ZINB-WaVE: zero-inflated negative binomial-based Wanted Variation Extraction) - [link](https://www.nature.com/articles/s41467-017-02554-5) + +[3] Svensson et al, Bioinformatics, 2020 (LDVAE: linearly decoded variational autoencoder) - [link](https://academic.oup.com/bioinformatics/article/36/11/3418/5807606) + + + + + + diff --git a/installation.md b/installation.md new file mode 100644 index 0000000..17a8816 --- /dev/null +++ b/installation.md @@ -0,0 +1,43 @@ +--- +layout: default +title: "Installation" +--- + +## Stable release (easiest) + +CellRegMap is implemented as a Python package. +To install CellRegMap using the pip python installer, enter: + + pip install cellregmap + +in your command line. 
+ +## Developmental mode + +To use the latest features of CellRegMap you can install the latest version from GitHub by entering: + + git clone https://github.com/limix/CellRegMap.git + cd CellRegMap + pip install -e . + +in your command line. + +## Installation using a Docker image +If you use Docker, you can also pull the [pre-build image from dockerhub](https://hub.docker.com/r/annasecuomo/cellregmap). + + + + diff --git a/limix.md b/limix.md new file mode 100644 index 0000000..121e198 --- /dev/null +++ b/limix.md @@ -0,0 +1,22 @@ +## Relation to LIMIX + +CellRegMap's linear mixed model (LMM) uses the FaST-LMM (Factored Spectrally Transformed Linear Mixed Model) implementation described [here](https://www.nature.com/articles/nmeth.1681) and used within the [LIMIX](https://github.com/limix/limix) framework. + +### Linear Mixed Model implementation using LIMIX +LIMIX is described in [this preprint](https://www.biorxiv.org/content/10.1101/003905v2) and documentation can be found [here](https://limix-tempdoc.readthedocs.io/en/latest/). +While there are several fast implementations to map eQTLs using correlation or linear regression-based approaches (e.g., [tensorQTL](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-019-1836-7), [matrix eQTL](https://academic.oup.com/bioinformatics/article/28/10/1353/213326?login=true)), to our knowledge LIMIX is the fastest software out there for eQTL association analyses using **linear mixed models** (which allow to better model population stratification and cryptic relatedness, which are prevalent in human genetic data - see for example thread [here](https://twitter.com/shaicarmi/status/1508298704796663808?s=21&t=6xaF5BmozHil3VbXotlGhQ)). 
+ +### eQTL mapping using LIMIX +Specifically, for a LIMIX wrapper to map eQTLs, see the [limix QTL pipeline](https://github.com/single-cell-genetics/limix_qtl) developed in our lab, which we most recently used in our [Genome Biology Publication](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02407-x). + +The pipeline provides an easy wrapper to map eQTLs using various methods, automatically re-ordering, subselecting and expanding files to match with each other. +See the wiki pages for [installation](https://github.com/single-cell-genetics/limix_qtl/wiki/Installation) and [input files](https://github.com/single-cell-genetics/limix_qtl/wiki/Inputs), and an example [snakemake for standard eQTL mapping](https://github.com/single-cell-genetics/limix_qtl/wiki/QTL-mapping-on-small-chunks-using-snakemake). + +### Coming Soon +With [Marc Jan Bonder](https://twitter.com/mjbonder), we are in the process of implementing CellRegMap runners compatible with this pipeline. + +Also see our sister project [scDALI](https://pmbio.github.io/scdali/), a model for modelling allelic imbalance in single cells. + + diff --git a/proof.md b/proof.md deleted file mode 100644 index 142ee3e..0000000 --- a/proof.md +++ /dev/null @@ -1,79 +0,0 @@ -K = GGt, Sigma = EEt -M = K*Sigma - -Let & be the Hadamard product. - -We known that: - - rank(A&B) <= rank(A) * rank(B) - -Let - - K = U @ Dk @ Ut - Sigma = V @ Ds @ Vt - -for diagonal matrices Dk and Ds. - -K & Sigma = K & (Sum_i vi * lambda_i * vi^T ) = Sum_i (K & (vi @ lambda_i @ vi^T)) - = Sum_i ( Lki @ Lki^T ) - -Let e = |1 1 ... 1|. We have - - K & (vi @ lambda_i @ vi^T) = K & (ui @ ui^T) - = K & (Diag(ui) @ e @ e^T @ Diag(ui)) - = Diag(ui) @ (K & (e @ e^T)) @ Diag(ui) (See (2.10), [1]) - = Diag(ui) @ K @ Diag(ui) - = Diag(ui) @ G @ Gt @ Diag(ui) - = Lki @ Lki^T - -for Lki = Diag(ui) @ G. 
- -Trial ------ - -```python - import numpy as np - - K = np.random.randn(3, 3) - K = K @ K.T - - # rank-2 symmetric matrix - u0 = np.random.randn(3, 1) - u1 = np.random.randn(3, 1) - Sigma = u0 @ u0.T + u1 @ u1.T - sigma, U = np.linalg.eigh(Sigma) - - tmp = sigma[1] * U[:, [1]] @ U[:, [1]].T + sigma[2] * U[:, [2]] @ U[:, [2]].T - # Close to zero - print(tmp - Sigma) - - u1 = np.sqrt(sigma[1]) * U[:, [1]] - u2 = np.sqrt(sigma[2]) * U[:, [2]] - - tmp = u1 @ u1.T + u2 @ u2.T - Sigma - # Close to zero - print(tmp) - - e = np.ones(3) - - G = np.linalg.cholesky(K) - - Lk1 = np.diag(u1.ravel()) @ G - Lk2 = np.diag(u2.ravel()) @ G - - # Close to zero - print(Lk1 @ Lk1.T + Lk2 @ Lk2.T - K * Sigma) -``` - -Goal ----- - -- tr(K & Sigma) (there is a property) -- det(K & Sigma) -- (K & Sigma)^{-1} @ x - -References ----------- - -[1] Hadamard Products and Multivariate Statistical Analysis. - https://core.ac.uk/download/pdf/82272966.pdf diff --git a/references/greater power and computational efficiency.pdf.bz2 b/references/greater power and computational efficiency.pdf.bz2 deleted file mode 100644 index c98aefa..0000000 Binary files a/references/greater power and computational efficiency.pdf.bz2 and /dev/null differ diff --git a/references/rachel thesis part 1.pdf.bz2 b/references/rachel thesis part 1.pdf.bz2 deleted file mode 100644 index 3e97cc1..0000000 Binary files a/references/rachel thesis part 1.pdf.bz2 and /dev/null differ diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index ebfda63..0000000 --- a/setup.cfg +++ /dev/null @@ -1,69 +0,0 @@ -[metadata] -author = Anna Cuomo, Danilo Horta -author_email = acuomo@ebi.ac.uk -classifiers = - Development Status :: 5 - Production/Stable - License :: OSI Approved :: MIT License - Operating System :: OS Independent - Programming Language :: Python -description = A linear mixed model framework to map multivariate context-specific eQTL using single-cell RNA-seq data. 
-download_url = https://github.com/limix/CellRegMap -keywords = scRNA-seq, eQTL, context-specific regulation -license = MIT -long_description = file: README.md -long_description_content_type = text/markdown -maintainer = Danilo Horta -platforms = Windows, MacOS, Linux -maintainer_email = horta@ebi.ac.uk -name = cellregmap -url = https://github.com/limix/CellRegMap -version = attr: version.get - -[options] -zip_safe = True -include_package_data = True -packages = find: -python_requires = >= 3.6 -install_requires = - chiscore>=0.2.3 - glimix-core>=3.1.12 - numpy-sugar>=1.5.1 - numpy>=1.17.5 - pytest>=5.4.3 - scipy>=1.2.3 - tqdm>=4.41.1 - -[aliases] -test = pytest - -[tool:pytest] -addopts = - --doctest-modules - --doctest-glob='*.rst' - --ignore="setup.py" -doctest_plus = enabled -doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS ALLOW_UNICODE -doctest_rst = enabled -norecursedirs = .eggs .git *.egg-info build .ropeproject .undodir old_files - -[flake8] -ignore = E501,E741,E203,W503,W0212,W0622,R0915,E743 - -[pycodestyle] -ignore = E741,E743,E203 -max-line-length = 88 - -[tool:isort] -multi_line_output=3 -include_trailing_comma=True -force_grid_wrap=0 -combine_as_imports=True -line_length=88 - -[pylint] -disable = redefined-builtin,R0915 - -[rstcheck] -ignore_substitutions = today, version -ignore_directives = plot, autofunction, command-output, autmodule, automodule, autoclass, autoattribute, automethod, doctest -ignore_messages = Error in "math" directive diff --git a/setup.py b/setup.py deleted file mode 100644 index 2553605..0000000 --- a/setup.py +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env python - -from setuptools import setup - -if __name__ == "__main__": - setup() diff --git a/structlmm.md b/structlmm.md new file mode 100644 index 0000000..e3e07c6 --- /dev/null +++ b/structlmm.md @@ -0,0 +1,25 @@ +--- +layout: default +title: "StructLMM" +--- + +## Relation to StructLMM model +CellRegMap builds on and 
extends the structured linear mixed model (StructLMM) model, proposed in [Moore\*, Casale\* et al, 2018](https://www.nature.com/articles/s41588-018-0271-0), in the context of population genetics.
+StructLMM allows testing for GxE effects across multiple environmental exposures at once, extending traditional interaction models which can only consider one environment at a time.
+
+However, StructLMM is not designed to deal with repeated or related samples.
+Thus, it is not well suited to model longitudinal data (where multiple observations from the same individuals are collected over time) or single-cell data (where multiple cells are collected from the same individual), nor can it optimally model population stratification and cryptic relatedness, which have been shown to be prevalent in population genetic data.
+CellRegMap overcomes this by including an additional random effect term that models relatedness across samples.
+
+Nevertheless, the original StructLMM model can be run using CellRegMap, by simply setting the relatedness term to None, i.e.:
+
+    hK=None
+
+and then running the model similarly to what is described in the [usage page](https://limix.github.io/CellRegMap/usage.html), i.e.:
+
+    from cellregmap import run_interaction
+
+    pv_slmm = run_interaction(y=y, W=W, E=E, G=g, hK=None)[0]
+    print(f'StructLMM interaction test p-value: {pv_slmm}')
+
+where we note that "E" is used here instead of "C" as typically this model will be applied in the context of population genetics to test for effects with environmental exposures, as opposed to the cellular contexts generally considered in applications of CellRegMap to scRNA-seq data.
diff --git a/tutorials.md b/tutorials.md new file mode 100644 index 0000000..fdd32e5 --- /dev/null +++ b/tutorials.md @@ -0,0 +1,14 @@ +--- +layout: default +title: "Tutorials" +--- + +For basic usage on toy data, see the [Usage page](https://limix.github.io/CellRegMap/usage.html) + + +For applications of CellRegMap described in [our manuscript](https://www.embopress.org/doi/full/10.15252/msb.202110663), see +* [Application on simulated data & Simulation strategy](https://github.com/annacuomo/CellRegMap_analyses/tree/main/simulations) +* [Application to single-cell dataset of iPSCs differentiating towards endoderm](https://github.com/annacuomo/CellRegMap_analyses/tree/main/endodiff) +* [Application to single-cell dataset of iPSCs differentiating towards dopaminergic neurons](https://github.com/annacuomo/CellRegMap_analyses/tree/main/neuroseq) + +More to come soon! diff --git a/usage.md b/usage.md new file mode 100644 index 0000000..9d2ba43 --- /dev/null +++ b/usage.md @@ -0,0 +1,104 @@ +--- +layout: default +title: "Usage" +--- + +There are three main functions that can be run within the CellRegMap package (detailed in our [docs](https://cellregmap.readthedocs.io/)): + +* Association test +* Interaction test +* Estimation of effect sizes + +## Association test (persistent effects) + +The main functionality of CellRegMap is to investigate genotype-context (GxC) interactions and identify context-specific genetic effects on expression in cohort-scale single-cell data (see **Interaction test** below). +However, to improve scalability, we recommend running the (computationally more intensive) interaction-test function only on a set of candidate eQTLs. 
+In the [original CellRegMap paper](https://www.biorxiv.org/content/10.1101/2021.09.01.458524v1) we consider eQTLs previously identified in the original studies[1,2], but it is now also possible to test for persistent eQTL effects within the CellRegMap framework itself, using the association-test function.
+In this case, the model can be cast as:
+
+$y = W\alpha + g\beta_G + c + u + \epsilon$,
+
+which is similar to the main model except for the GxC term, which is missing. Here, we test for a persistent effect only, i.e., $\beta_G \neq 0$.
+
+CellRegMap function: _run_association()_
+
+## Interaction test (GxC effects)
+
+This is the main test implemented in CellRegMap, where we test for GxC effects across cellular states and individual SNP variants.
+In this case we consider the full model:
+
+$y = W\alpha + g\beta_G + g \odot \beta_{GxC} + c + u + \epsilon$
+
+and test for $\beta_{GxC} \neq 0$.
+While in principle any SNP-gene pairs can be tested for GxC effects, we recommend running this test on a set of candidate eQTLs (either known a priori or identified using the **Association test** described above), or interesting (e.g., disease-linked) variants to improve statistical power.
+
+CellRegMap function: _run_interaction()_
+
+## Estimation of effect sizes
+
+Finally, CellRegMap can be used to estimate cell-level effect sizes driven by GxC effects for individual eQTLs ($\beta_{GxC}$), thus predicting the cells where those effects are detected.
+Generally, it makes sense to use this function to characterise eQTLs that show evidence of GxC effects, i.e., for which significant p-values were obtained when using the **Interaction test** above.
+The model is the same except for the term $c$, which is now modelled as fixed effects in order to estimate the GxC term itself.
+
+CellRegMap function: _estimate_betas()_
+
+For more details on the tests above and underlying assumptions we refer the reader to the Supplementary Methods available as part of the [paper's Appendix](https://www.embopress.org/action/downloadSupplement?doi=10.15252%2Fmsb.202110663&file=msb202110663-sup-0001-Appendix.pdf).
+
+## Simple usage example
+
+The model is implemented in [python](https://www.python.org).
+All vectors and matrices should be provided as [numpy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html), and there should be no flat (one-dimensional) arrays.
+If the shape of a vector is (n,) please [reshape](https://numpy.org/doc/stable/reference/generated/numpy.reshape.html) to (n,1).
+
+Below, see a simple usage example with toy inputs:
+
+    from numpy import ones
+    from numpy.random import RandomState
+
+    from cellregmap import run_association, run_interaction, estimate_betas
+
+    random = RandomState(1)
+    n = 30   # number of samples (cells)
+    p = 5    # number of individuals
+    k = 4    # number of contexts
+    y = random.randn(n, 1)   # outcome vector (expression phenotype, one gene only)
+    C = random.randn(n, k)   # context matrix (cells by contexts/factors)
+    W = ones((n, 1))         # intercept (covariate matrix)
+    hK = random.randn(n, p)  # decomposition of kinship matrix (K = hK @ hK.T)
+    g = 1.0 * (random.rand(n, 1) < 0.2)  # SNP vector
+
+    ## Association test
+    pv0 = run_association(y=y, G=g, W=W, E=C, hK=hK)[0]
+    print(f'Association test p-value: {pv0}')
+
+    ## Interaction test
+    pv = run_interaction(y=y, G=g, W=W, E=C, hK=hK)[0]
+    print(f'Interaction test p-value: {pv}')
+
+    # Effect sizes estimation
+    betas = estimate_betas(y=y, G=g, W=W, E=C, hK=hK)
+    beta_G = betas[0]       # persistent effect (scalar)
+    beta_GxC = betas[1][0]  # GxC effects (vector)
+
+    print(f'persistent genetic effect (beta_G): {beta_G}')
+    print(f'cell-level effect sizes due to GxC (beta_GxC): {beta_GxC}')
+
+
+For more guidelines and instructions on how to construct input
files from real data, please visit the [Input Files page](https://limix.github.io/CellRegMap/input_files.html). + +For a workflow description language ([WDL](https://openwdl.org/)) pipeline to run CellRegMap across thousands of genes check the pipeline's [Github page](https://github.com/populationgenomics/cellregmap-pipeline) and [Dockerhub repo](https://hub.docker.com/r/annasecuomo/cellregmap_pipeline). + +## References + +1. Cuomo\*, Seaton\*, McCarthy\* et al, Nature Communications, 2020 - [Single-cell RNA-sequencing of differentiating iPS cells reveals dynamic genetic effects on gene expression](https://www.nature.com/articles/s41467-020-14457-z) + +2. Jerber\*, Seaton\*, Cuomo\* et al, Nature Genetics, 2021 - [Population-scale single-cell RNA-seq profiling across dopaminergic neuron differentiation](https://www.nature.com/articles/s41588-021-00801-6) + + + + + diff --git a/version.py b/version.py deleted file mode 100644 index a3bbb9e..0000000 --- a/version.py +++ /dev/null @@ -1,17 +0,0 @@ -import re -from os.path import join - -from setuptools import find_packages - - -def get(): - pkgnames = find_packages() - if len(pkgnames) == 0: - return "unknown" - pkgname = pkgnames[0] - content = open(join(pkgname, "__init__.py")).read() - c = re.compile(r"__version__ *= *('[^']+'|\"[^\"]+\")") - m = c.search(content) - if m is None: - return "unknown" - return m.groups()[0][1:-1]