Source code for clep.sample_scoring.limma

# -*- coding: utf-8 -*-

"""Python wrapper for R-based Limma to perform single sample DE analysis."""
import sys
from typing import List

import click
import numpy as np
import pandas as pd
import rpy2.robjects as ro
from rpy2.rinterface_lib.embedded import RRuntimeError
from rpy2.robjects import pandas2ri, Formula
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr
from statsmodels.stats.multitest import multipletests


[docs]def do_limma(data: pd.DataFrame, design: pd.DataFrame, alpha: float, method: str, control: str) -> pd.DataFrame:
    """Perform data manipulation before limma based SS scoring.

    :param data: Dataframe containing the gene expression values
    :param design: Dataframe containing the design table for the data
    :param alpha: Family-wise error rate
    :param method: Method used family-wise error correction
    :param control: label used for representing the control in the design table of the data
    :return: Dataframe containing the Single Sample scores from limma
    """
    label_mapping = {
        key: val
        for val, key in enumerate(np.unique(design['Target']))
    }

    # Get the control patients using the control parameter
    control_filter = (design.Target == control)

    # The data is transposed to get in the same format as the design matrix & then after the filtering transposed
    # again to return to the original format
    ctrl_data = data.transpose()[list(control_filter)].transpose()

    # Here the sample stands for diseased patients
    sample_data = data.transpose()[list(~control_filter)].transpose()

    ctrl_design = design[list(control_filter)]
    sample_design = design[list(~control_filter)]

    # Final output dataframe (patients as rows & genes as columns)
    output_df = pd.DataFrame(columns=sample_data.index, index=sample_data.columns)

    # Loop over every sample to get the individual limma based SS scores
    for col_idx, col in enumerate(sample_data.columns):
        data_df = ctrl_data.copy()

        # Get a single sample from dataframe
        data_df[col] = sample_data.iloc[:, col_idx]

        # Add the design of that sample with the control samples to make the design table
        design_df = ctrl_design.append(sample_design.iloc[col_idx, :], ignore_index=True)

        output = _limma(data=data_df, design=design_df, alpha=alpha, adjust_method=method)

        # Only store the logFC values
        output_df.iloc[col_idx, :] = output['logFC'].values.flatten()

    label = sample_design['Target'].map(label_mapping)
    label.reset_index(drop=True, inplace=True)

    output_df = output_df.apply(_bin).copy()

    output_df['label'] = label.values

    return output_df


def _limma(data: pd.DataFrame, design: pd.DataFrame, alpha: float = 0.05,
           adjust_method: str = 'fdr_bh') -> pd.DataFrame:
    """Wrap limma to perform single sample DE analysis."""
    # Import R libraries
    base = importr('base')
    stats = importr('stats')

    try:
        limma = importr('limma')
    except RRuntimeError as e:
        click.echo(e)
        click.echo("Please check if limma package is installed in R. \n If not, follow the instructions from LINK "
                   "HERE.")
        sys.exit(1)

    # Convert data and design pandas dataframes to R dataframes
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_data = ro.conversion.py2rpy(data)
        r_design = ro.conversion.py2rpy(design)

    # Use the genes index column from data as a R String Vector
    genes = ro.StrVector(
        [
            str(index)
            for index in data.index.tolist()
        ]
    )

    # Create a model matrix using design's Target column using the R formula "~0 + f" to get all the unique factors
    # as columns
    f = base.factor(r_design.rx2('Target'), levels=base.unique(r_design.rx2('Target')))
    form = Formula('~0 + f')
    form.environment['f'] = f
    r_design = stats.model_matrix(form)
    r_design.colnames = base.levels(f)

    # Fit the data to the design using lmFit from limma
    fit = limma.lmFit(r_data, r_design)

    # Make a contrasts matrix with the 1st and the last unique values
    contrast_matrix = limma.makeContrasts(f"{r_design.colnames[0]}-{r_design.colnames[-1]}", levels=r_design)

    # Fit the contrasts matrix to the lmFit data & calculate the bayesian fit
    fit2 = limma.contrasts_fit(fit, contrast_matrix)
    fit2 = limma.eBayes(fit2)

    # topTreat the bayesian fit using the contrasts and add the genelist
    r_output = limma.topTreat(fit2, coef=1, genelist=genes, number=np.Inf)

    # Convert R dataframe to Pandas
    with localconverter(ro.default_converter + pandas2ri.converter):
        output = ro.conversion.rpy2py(r_output)

    # Adjust P value with the provided adjusted method
    output['adj.P.Val'] = multipletests(output['P.Value'], alpha=alpha, method=adjust_method)[1]
    output['logFC'].loc[output['adj.P.Val'] > 0.05] = 0
    output['logFC'].loc[np.abs(output['logFC']) < 1.3] = 0

    return output


def _bin(row: pd.Series) -> List[int]:
    """Replace values greater than 0 as 1 and lesser than 0 as -1."""
    return [
        1 if (val > 0) else (-1 if (val < 0) else 0)
        for val in row
    ]