Source code for xrf_explorer.server.dim_reduction.embedding

import logging

from os.path import isdir, join

import numpy as np

from umap import UMAP

from xrf_explorer.server.dim_reduction.general import (
    valid_element,
    get_path_to_dr_folder,
    create_image_of_indices_to_embedding
)
from xrf_explorer.server.file_system import get_config
from xrf_explorer.server.file_system.cubes import normalize_ndarray_to_grayscale, get_elemental_data_cube

LOG: logging.Logger = logging.getLogger(__name__)



[docs]
def apply_umap(data: np.ndarray, n_neighbors: int, min_dist: float, n_components: int,
               metric: str) -> np.ndarray | None:
    """
    Reduces the dimensionality of the given data using uniform manifold approximation and projection (UMAP).
    The original data is not modified. For more information on UMAP, see: https://umap-learn.readthedocs.io/en/latest/.

    :param data: np.ndarray, shape (n_samples, n_features). The data on which UMAP is used to reduce the dimension of
        features to n_components
    :param n_neighbors: The size of local neighborhood. See UMAP documentation for more information
    :param min_dist: The minimum distance between points in the embedding. See UMAP documentation for more information
    :param n_components: The dimension of the embedded space. See UMAP documentation for more information
    :param metric: The metric to use for distance computation. See UMAP documentation for more information
    :return: np.ndarray, shape (n_samples, n_components) containing the result of UMAP applied to given data with the
        given parameters. If UMAP fails, the failure is logged and None is returned
    """

    try:
        embedding: np.ndarray = UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric=metric
        ).fit_transform(data)
    except Exception:
        # Keep the "None on failure" contract for callers, but record the traceback so the
        # cause (bad parameters, too few samples, ...) can be diagnosed. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt/SystemExit propagate.
        LOG.exception("UMAP failed to compute an embedding")
        return None

    return embedding




[docs]
def filter_elemental_cube(elemental_cube: np.ndarray, element: int,
                          threshold: int, max_indices: int) -> tuple[np.ndarray, np.ndarray]:
    """
    Get indices for which the value of the given element in the normalized elemental data cube is at or above the
    threshold, together with a (possibly downsampled) subset of those indices.

    :param elemental_cube: Elemental data cube, indexed by element along its first axis
    :param element: The index of the element (first axis of the cube) to filter on
    :param threshold: The threshold to filter by, applied to the normalized elemental map
    :param max_indices: The maximum number of indices to return in the reduced list
    :return: Tuple (all_indices, reduced_indices). all_indices holds every index whose normalized value is at or above
        the threshold, as produced by np.argwhere (one row per pixel). reduced_indices is the same array when it has
        at most max_indices rows; otherwise it is a random selection of max_indices distinct rows of all_indices
    """

    # normalize the elemental map to [0, 255]
    # this is done so the threshold can be applied
    normalized_elemental_map: np.ndarray = normalize_ndarray_to_grayscale(elemental_cube[element])

    # get all indices for which the intensity of the given element is at or above the threshold
    all_indices: np.ndarray = np.argwhere(normalized_elemental_map >= threshold)
    n_indices: int = all_indices.shape[0]

    # nothing to downsample: the reduced list is the full list
    if n_indices <= max_indices:
        return all_indices, all_indices

    LOG.info("Number of data points for dimensionality reduction is higher than the configured limit. "
             "Points will be randomly downsampled, (%i -> %i)", n_indices, max_indices)

    # Use default rng to ensure random selection every time.
    # replace=False: sample without replacement so no pixel is selected more than once
    # (the default, replace=True, can return duplicate rows).
    selected_rows: np.ndarray = np.random.default_rng().choice(n_indices, size=max_indices, replace=False)

    return all_indices, all_indices[selected_rows]




[docs]
def generate_embedding(data_source: str, element: int, threshold: int, new_umap_parameters=None) -> str:
    """
    Generate the embedding (lower dimensional representation of the data) of the elemental data cube using the
    dimensionality reduction method "UMAP". The embedding with the list of indices (which pixels from the elemental data
    cube are in the embedding) are stored in the folder specified in the backend config file. The order the indices
    occur in the indices list is the same order as the positions of the mapped pixels in the embedding.

    Three files are written to the dimensionality reduction folder of the data source: 'indices.npy' (the indices used
    for the embedding), 'all_indices.npy' (all indices that passed the threshold) and 'embedded_data.npy'.

    :param data_source: The name of the data source to generate the embedding for
    :param element: The element to generate the embedding for
    :param threshold: The threshold to filter the data cube by
    :param new_umap_parameters: Optional overrides for the UMAP parameters from the backend config (keys
        'n-neighbors', 'min-dist', 'n-components', 'metric'). The backend config itself is not modified
    :return: string code indicating the status of the embedding generation. "error" when error occurred, "success" when
        embedding was generated successfully, "downsampled" when successful and the number of data points was
        downsampled
    """

    backend_config: dict | None = get_config()  # get the backend config
    dr_folder: str = get_path_to_dr_folder(data_source)  # get path to folder to store the embedding and the indices
    data_cube: np.ndarray = get_elemental_data_cube(data_source)  # get data cube

    if not backend_config or not isdir(dr_folder) or len(data_cube) == 0:
        LOG.error("Failed to load a necessary file")
        return "error"
    if not valid_element(element, data_cube):
        return "error"

    dr_config: dict = backend_config['dim-reduction']

    # Start from a copy of the default parameters: updating the dict in place would write the
    # per-call overrides into the config object itself, which may be shared between calls.
    umap_parameters: dict[str, str] = dict(dr_config['umap-parameters'])

    # update the default parameters with the given parameters
    if new_umap_parameters is not None:
        umap_parameters.update(new_umap_parameters)

    # filter data
    max_samples: int = int(dr_config['max-samples'])
    all_indices, reduced_indices = filter_elemental_cube(data_cube, element, threshold, max_samples)

    # without any data point there is nothing to embed; report this explicitly
    if len(reduced_indices) == 0:
        LOG.error("No data points at or above the threshold, cannot compute embedding")
        return "error"

    # one row per selected pixel, one column per element of the cube
    filtered_data: np.ndarray = data_cube[:, reduced_indices[:, 0], reduced_indices[:, 1]].transpose()

    # compute embedding
    LOG.info("Generating embedding with: {element: %s, threshold: %s, size: %s}",
             element, threshold, filtered_data.shape)

    embedded_data: np.ndarray | None = apply_umap(
        filtered_data,
        int(umap_parameters['n-neighbors']),
        float(umap_parameters['min-dist']),
        int(umap_parameters['n-components']),
        umap_parameters['metric']
    )

    if embedded_data is None:
        LOG.error("Failed to compute embedding")
        return "error"

    # save indices and embedded data
    np.save(join(dr_folder, 'indices.npy'), reduced_indices)
    np.save(join(dr_folder, 'all_indices.npy'), all_indices)
    np.save(join(dr_folder, 'embedded_data.npy'), embedded_data)

    # create image of indices to embedding
    create_image_of_indices_to_embedding(data_source)

    LOG.info("Generated embedding successfully")

    # fewer points in the embedding than passed the threshold means downsampling happened
    if len(all_indices) != len(reduced_indices):
        return "downsampled"
    return "success"