taps.apps.moldesign.tasks¶

taps/apps/moldesign/tasks.py

MorganFingerprintTransformer ¶

MorganFingerprintTransformer(
    length: int = 256, radius: int = 4
)

Bases: BaseEstimator, TransformerMixin

Class that converts SMILES strings to fingerprint vectors.

Source code in taps/apps/moldesign/tasks.py

def __init__(self, length: int = 256, radius: int = 4) -> None:
    """Store the fingerprint settings.

    Args:
        length: Bit-length of each fingerprint.
        radius: Radius used to compute each fingerprint.
    """
    # Kept under the same names as the constructor arguments.
    self.length, self.radius = length, radius

fit ¶

fit(X: Any, y: Any = None) -> Self

Fit the transformer.

Source code in taps/apps/moldesign/tasks.py

def fit(self, X: Any, y: Any = None) -> Self:  # noqa: N803
    """Fit the transformer.

    Nothing is learned from the data; both arguments are accepted and
    ignored.

    Args:
        X: Ignored.
        y: Ignored.

    Returns:
        This transformer instance, unchanged.
    """
    # Do not need to do anything.
    return self

transform ¶

transform(X: Any, y: Any = None) -> Any

Compute the fingerprints.

Parameters:

X (Any) –

List of SMILES strings.
y (Any, default: None ) –

Ignored.

Returns:

Any –

Array of fingerprints.

Source code in taps/apps/moldesign/tasks.py

def transform(self, X: Any, y: Any = None) -> Any:  # noqa: N803
    """Compute the fingerprints.

    Each SMILES string is fingerprinted in a pool of worker processes
    and the per-molecule results are stacked row-wise.

    Args:
        X: List of SMILES strings.
        y: Ignored.

    Returns:
        Array of fingerprints.
    """
    # Bind the fingerprint settings up front so that each worker call
    # only needs the SMILES string itself.
    fingerprint_one = partial(
        compute_morgan_fingerprints,
        fingerprint_length=self.length,
        fingerprint_radius=self.radius,
    )

    # NOTE(review): n_workers is not defined in this function; presumably
    # a module-level setting -- confirm.
    with ProcessPoolExecutor(max_workers=n_workers) as executor:
        # Large chunks cut down on inter-process messaging per molecule.
        rows = [*executor.map(fingerprint_one, X, chunksize=2048)]

    return numpy.vstack(rows)

generate_initial_xyz ¶

generate_initial_xyz(mol_string: str) -> str

Generate the XYZ coordinates for a molecule.

Parameters:

mol_string (str) –

SMILES string.

Returns:

str –

XYZ coordinates for the molecule.

Source code in taps/apps/moldesign/tasks.py

def generate_initial_xyz(mol_string: str) -> str:
    """Generate the XYZ coordinates for a molecule.

    Args:
        mol_string: SMILES string.

    Returns:
        XYZ coordinates for the molecule: an atom-count line, the SMILES
        string as the comment line, then one `symbol x y z` line per atom.

    Raises:
        ValueError: If the SMILES string cannot be parsed or if no 3D
            conformer could be embedded for the molecule.
    """
    # Generate 3D coordinates for the molecule
    mol = Chem.MolFromSmiles(mol_string)
    if mol is None:
        raise ValueError(f'Parse failure for {mol_string}')
    mol = Chem.AddHs(mol)

    # EmbedMolecule signals failure by returning -1 and leaves the molecule
    # without a conformer; fail here with a clear message rather than with
    # an opaque conformer lookup error further down.
    if AllChem.EmbedMolecule(mol, randomSeed=1) < 0:
        raise ValueError(f'Embedding failure for {mol_string}')

    # Force-field refinement is best-effort: its status code is deliberately
    # not checked, so an unconverged geometry is still returned.
    AllChem.MMFFOptimizeMolecule(mol)

    # Save geometry as 3D coordinates
    conf = mol.GetConformer()
    lines = [str(mol.GetNumAtoms()), mol_string]
    for i, atom in enumerate(mol.GetAtoms()):
        pos = conf.GetAtomPosition(i)
        lines.append(f'{atom.GetSymbol()} {pos[0]} {pos[1]} {pos[2]}')

    # Every line, including the last, is newline-terminated.
    return '\n'.join(lines) + '\n'

compute_vertical ¶

compute_vertical(smiles: str) -> float

Run the ionization potential computation.

Parameters:

smiles (str) –

SMILES string to evaluate.

Returns:

float –

Ionization energy in Ha.

Source code in taps/apps/moldesign/tasks.py

@task()
def compute_vertical(smiles: str) -> float:
    """Run the ionization potential computation.

    The molecule is relaxed with the XTB calculator, its energy is
    recorded, and the energy is then re-evaluated at the same geometry
    after assigning initial charges that sum to +1.

    Args:
        smiles: SMILES string to evaluate.

    Returns:
        Ionization energy in Ha.
        NOTE(review): the unit is taken from the original docstring and
        depends on what the calculator reports -- confirm.
    """
    # Starting geometry, parsed from an in-memory XYZ document.
    initial_xyz = generate_initial_xyz(smiles)
    atoms = read(StringIO(initial_xyz), format='xyz')

    # Attach the XTB calculator and relax the neutral geometry.
    # NOTE(review): the original comment credited QCEngine
    # (https://github.com/MolSSI/QCEngine) with handling the XTB interface,
    # but no QCEngine call is visible in this function -- confirm.
    atoms.calc = XTB(accuracy=0.05)
    optimizer = LBFGSLineSearch(atoms, logfile=None)
    optimizer.run(fmax=0.02, steps=250)
    neutral_energy = atoms.get_potential_energy()

    # Compute the energy of the relaxed geometry in charged form: spread
    # a total charge of +1 evenly over all atoms.
    n_atoms = len(atoms)
    per_atom_charge = 1 / n_atoms
    atoms.set_initial_charges(numpy.ones((n_atoms,)) * per_atom_charge)
    charged_energy = atoms.get_potential_energy()

    return charged_energy - neutral_energy

compute_morgan_fingerprints ¶

compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[bool]

Get Morgan Fingerprint of a specific SMILES string.

Adapted from: https://github.com/google-research/google-research/blob/dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

Parameters:

smiles (str) –

The molecule as a SMILES string.
fingerprint_length (int) –

Bit-length of fingerprint.
fingerprint_radius (int) –

Radius used to compute fingerprint.

Returns:

NDArray[bool] –

Array of length fingerprint_length holding the bits of the Morgan fingerprint.

Source code in taps/apps/moldesign/tasks.py

def compute_morgan_fingerprints(
    smiles: str,
    fingerprint_length: int,
    fingerprint_radius: int,
) -> NDArray[numpy.bool]:
    """Get Morgan Fingerprint of a specific SMILES string.

    Adapted from:
    https://github.com/google-research/google-research/blob/dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750

    Args:
        smiles: The molecule as a SMILES string.
        fingerprint_length: Bit-length of fingerprint.
        fingerprint_radius: Radius used to compute fingerprint.

    Returns:
        Boolean array of length `fingerprint_length` holding the bits of
        the Morgan fingerprint.

    Raises:
        ValueError: If the SMILES string cannot be parsed.
    """
    # Parse the molecule. MolFromSmiles returns None on invalid input;
    # fail with the offending string instead of passing None onward.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError(f'Parse failure for {smiles}')

    # Compute the fingerprint
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(
        molecule,
        fingerprint_radius,
        fingerprint_length,
    )

    # Placeholder buffer; ConvertToNumpyArray resizes it to the bit-vector
    # length before filling it in.
    arr = numpy.zeros((1,), dtype=numpy.bool_)

    # ConvertToNumpyArray takes ~ 0.19 ms, while
    # numpy.asarray takes ~ 4.69 ms
    DataStructs.ConvertToNumpyArray(fingerprint, arr)
    return arr

train_model ¶

train_model(train_data: DataFrame) -> Pipeline

Train a machine learning model using Morgan Fingerprints.

Parameters:

train_data (DataFrame) –

Dataframe with a 'smiles' and 'ie' column that contains molecule structure and property, respectively.

Returns:

Pipeline –

A trained model.

Source code in taps/apps/moldesign/tasks.py

@task()
def train_model(train_data: pandas.DataFrame) -> Pipeline:
    """Train a machine learning model using Morgan Fingerprints.

    The model is a two-step pipeline: SMILES strings are converted to
    Morgan fingerprints, which feed a distance-weighted k-nearest-neighbors
    regressor using the Jaccard metric.

    Args:
        train_data: Dataframe with a 'smiles' and 'ie' column
            that contains molecule structure and property, respectively.

    Returns:
        A trained model.
    """
    featurizer = MorganFingerprintTransformer()
    regressor = KNeighborsRegressor(
        n_neighbors=4,
        weights='distance',
        metric='jaccard',
        n_jobs=-1,
    )
    pipeline = Pipeline([('fingerprint', featurizer), ('knn', regressor)])

    # Ray arrays are immutable so need to clone.
    molecules = train_data['smiles'].copy()
    targets = train_data['ie'].copy()
    return pipeline.fit(molecules, targets)

run_model ¶

run_model(model: Pipeline, smiles: list[str]) -> DataFrame

Run a model on a list of smiles strings.

Parameters:

model (Pipeline) –

Trained model that takes SMILES strings as inputs.
smiles (list[str]) –

List of molecules to evaluate.

Returns:

DataFrame –

A dataframe with the molecules and their predicted outputs.

Source code in taps/apps/moldesign/tasks.py

@task()
def run_model(model: Pipeline, smiles: list[str]) -> pandas.DataFrame:
    """Run a model on a list of smiles strings.

    Args:
        model: Trained model that takes SMILES strings as inputs.
        smiles: List of molecules to evaluate.

    Returns:
        A dataframe with the molecules and their predicted outputs, in
        the columns 'smiles' and 'ie'.
    """
    predictions = model.predict(smiles)

    # Pair every input molecule with its prediction, column by column.
    columns = {'smiles': smiles, 'ie': predictions}
    return pandas.DataFrame(columns)

combine_inferences ¶

combine_inferences(*inputs: DataFrame) -> DataFrame

Concatenate a series of inferences into a single DataFrame.

Parameters:

inputs (DataFrame, default: () ) –

A list of the component DataFrames.

Returns:

DataFrame –

A single DataFrame containing the same inferences.

Source code in taps/apps/moldesign/tasks.py

@task()
def combine_inferences(*inputs: pandas.DataFrame) -> pandas.DataFrame:
    """Concatenate a series of inferences into a single DataFrame.

    Args:
        inputs: The component DataFrames, passed as positional arguments.

    Returns:
        A single DataFrame containing the same inferences, with a fresh
        index replacing the indices of the inputs.
    """
    # ignore_index discards the per-input indices and renumbers the rows.
    combined = pandas.concat(inputs, ignore_index=True)
    return combined