Skip to content

taps.apps.docking.train

Protein docking model training.

Module adapted from ParslDock.

MorganFingerprintTransformer

MorganFingerprintTransformer(
    length: int = 256, radius: int = 4
)

Bases: BaseEstimator, TransformerMixin

Class that converts SMILES strings to fingerprint vectors.

Source code in taps/apps/docking/train.py
def __init__(self, length: int = 256, radius: int = 4):
    """Record the fingerprint settings on the instance.

    Args:
        length: Stored as `self.length`.
        radius: Stored as `self.radius`.
    """
    # Plain attribute storage only; no validation or derived state.
    self.length, self.radius = length, radius

fit

fit(
    X: list[str], y: NDArray[bool] | None = None
) -> MorganFingerprintTransformer

Train model.

Parameters:

  • X (list[str]) –

    List of SMILES strings.

  • y (NDArray[bool] | None, default: None ) –

    Unused; accepted only for compatibility with the scikit-learn estimator API.

Returns:

Source code in taps/apps/docking/train.py
def fit(
    self,
    X: list[str],  # noqa: N803
    y: NDArray[numpy.bool] | None = None,
) -> MorganFingerprintTransformer:
    """Return the transformer unchanged.

    Nothing is learned from the inputs: both arguments are accepted
    and ignored.

    Args:
        X: List of SMILES strings (unused).
        y: Optional array (unused).

    Returns:
        This same instance.
    """
    # There is no state to fit, so hand the instance straight back.
    fitted = self
    return fitted

transform

transform(
    X: list[str], y: NDArray[bool] | None = None
) -> list[NDArray[bool]]

Compute the fingerprints.

Parameters:

  • X (list[str]) –

    List of SMILES strings.

  • y (NDArray[bool] | None, default: None ) –

    Unused; accepted only for compatibility with the scikit-learn estimator API.

Returns:

Source code in taps/apps/docking/train.py
def transform(
    self,
    X: list[str],  # noqa: N803
    y: NDArray[numpy.bool] | None = None,
) -> list[NDArray[numpy.bool]]:
    """Convert each SMILES string into its Morgan fingerprint.

    Args:
        X: List of SMILES strings.
        y: Unused.

    Returns:
        List with one fingerprint array per entry of `X`, in the same
        order, computed with this instance's `length` and `radius`.
    """
    return [
        compute_morgan_fingerprints(smiles, self.length, self.radius)
        for smiles in X
    ]

compute_morgan_fingerprints

compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[bool]

Get Morgan Fingerprint of a specific SMILES string.

Adapted from: https://github.com/google-research/google-research/blob/dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

Parameters:

  • smiles (str) –

    The molecule as a SMILES string.

  • fingerprint_length (int) –

    Bit-length of fingerprint.

  • fingerprint_radius (int) –

    Radius used to compute fingerprint.

Returns:

  • NDArray[bool]

    Array containing the Morgan fingerprint with shape [fingerprint_length].

Source code in taps/apps/docking/train.py
def compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[numpy.bool]:
    """Get Morgan Fingerprint of a specific SMILES string.

    Adapted from: https://github.com/google-research/google-research/blob/>
    dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

    Args:
        smiles: The molecule as a SMILES string.
        fingerprint_length: Bit-length of fingerprint.
        fingerprint_radius: Radius used to compute fingerprint.

    Returns:
        Array containing the Morgan fingerprint with shape
        `[fingerprint_length]`.

    Raises:
        ValueError: If RDKit cannot parse `smiles` into a molecule.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem import rdFingerprintGenerator

    # Parse the molecule. RDKit signals a parse failure by returning None
    # rather than raising, so fail here with a message naming the input
    # instead of letting the fingerprint call reject the None argument.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f'Unable to parse SMILES string: {smiles!r}')

    # Compute the fingerprint
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(
        radius=fingerprint_radius,
        fpSize=fingerprint_length,
    )
    fingerprint = mfpgen.GetFingerprint(molecule)

    # Placeholder destination array that ConvertToNumpyArray fills in
    # place (it is expected to resize it to the fingerprint's bit count).
    arr = numpy.zeros((1,), dtype=bool)

    # ConvertToNumpyArray takes ~ 0.19 ms, while
    # numpy.asarray takes ~ 4.69 ms
    DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr

train_model

train_model(training_data: DataFrame) -> Pipeline

Train a machine learning model using Morgan Fingerprints.

Parameters:

  • training_data (DataFrame) –

    Dataframe with a 'smiles' and a 'score' column that contain the molecule structure and docking score, respectively.

Returns:

  • Pipeline

    A trained model.

Source code in taps/apps/docking/train.py
def train_model(training_data: pandas.DataFrame) -> Pipeline:
    """Fit a fingerprint + nearest-neighbour regression pipeline.

    The pipeline first converts SMILES strings with
    `MorganFingerprintTransformer` (default settings) and then fits a
    `KNeighborsRegressor` using 4 neighbours, distance weighting and
    the Jaccard metric.

    Args:
        training_data: Dataframe with a 'smiles' and a 'score' column
            that contain the molecule structure and docking score,
            respectively.

    Returns:
        The fitted pipeline.
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.pipeline import Pipeline

    featurizer = MorganFingerprintTransformer()
    regressor = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        metric='jaccard',
        n_jobs=-1,
    )
    pipeline = Pipeline(
        [
            ('fingerprint', featurizer),
            ('knn', regressor),
        ],
    )

    molecules = training_data['smiles']
    scores = training_data['score']
    return pipeline.fit(molecules, scores)

run_model

run_model(model: Pipeline, smiles: list[str]) -> DataFrame

Run a model on a list of smiles strings.

Parameters:

  • model (Pipeline) –

    Trained model that takes SMILES strings as inputs.

  • smiles (list[str]) –

    List of molecules to evaluate.

Returns:

  • DataFrame

    A dataframe with the molecules and their predicted outputs

Source code in taps/apps/docking/train.py
def run_model(model: Pipeline, smiles: list[str]) -> pandas.DataFrame:
    """Predict scores for a list of SMILES strings.

    Args:
        model: Trained model whose `predict` accepts SMILES strings.
        smiles: List of molecules to evaluate.

    Returns:
        A dataframe with a 'smiles' column holding the inputs and a
        'score' column holding the model's predictions.
    """
    import pandas

    columns = {'smiles': smiles, 'score': model.predict(smiles)}
    return pandas.DataFrame(columns)