Source code for iris.analysis.stats

# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Statistical operations between cubes.

"""

import numpy as np
import numpy.ma as ma

import iris
from iris.util import broadcast_to_shape


def pearsonr(
    cube_a,
    cube_b,
    corr_coords=None,
    weights=None,
    mdtol=1.0,
    common_mask=False,
):
    """
    Calculate the Pearson's r correlation coefficient over specified
    dimensions.

    Args:

    * cube_a, cube_b (cubes):
        Cubes between which the correlation will be calculated. The cubes
        should either be the same shape and have the same dimension
        coordinates or one cube should be broadcastable to the other.
    * corr_coords (str or list of str):
        The cube coordinate name(s) over which to calculate correlations. If
        no names are provided then correlation will be calculated over all
        common cube dimensions.
    * weights (numpy.ndarray, optional):
        Weights array of same shape as (the smaller of) cube_a and cube_b.
        Note that latitude/longitude area weights can be calculated using
        :func:`iris.analysis.cartography.area_weights`.
    * mdtol (float, optional):
        Tolerance of missing data. The missing data fraction is calculated
        based on the number of grid cells masked in both cube_a and cube_b.
        If this fraction exceeds mdtol, the returned value in the
        corresponding cell is masked. mdtol=0 means no missing data is
        tolerated while mdtol=1 means the resulting element will be masked
        if and only if all contributing elements are masked in cube_a or
        cube_b. Defaults to 1.
    * common_mask (bool):
        If True, applies a common mask to cube_a and cube_b so only cells
        which are unmasked in both cubes contribute to the calculation. If
        False, the variance for each cube is calculated from all available
        cells. Defaults to False.

    Returns:
        A cube of the correlation between the two input cubes along the
        specified dimensions, at each point in the remaining dimensions of
        the cubes.

        For example providing two time/altitude/latitude/longitude cubes and
        corr_coords of 'latitude' and 'longitude' will result in a
        time/altitude cube describing the latitude/longitude (i.e. pattern)
        correlation at each time/altitude point.

    Reference:
        https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

    This operation is non-lazy.

    """
    # Assign the larger cube to cube_1.
    if cube_b.ndim > cube_a.ndim:
        cube_1 = cube_b
        cube_2 = cube_a
    else:
        cube_1 = cube_a
        cube_2 = cube_b

    smaller_shape = cube_2.shape

    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
    # If no coords passed then set to all common dim coords of the cubes.
    if corr_coords is None:
        corr_coords = common_dim_coords

    def _ones_like(cube):
        # Return a copy of cube with the same mask, but all data values set
        # to 1. The operation is non-lazy.
        # For safety we also discard any cell-measures and
        # ancillary-variables, to avoid cube arithmetic possibly objecting
        # to them, or inadvertently retaining them in the result where they
        # might be inappropriate.
        ones_cube = cube.copy()
        ones_cube.data = np.ones_like(cube.data)
        ones_cube.rename("unknown")
        ones_cube.units = 1
        for cm in ones_cube.cell_measures():
            ones_cube.remove_cell_measure(cm)
        for av in ones_cube.ancillary_variables():
            ones_cube.remove_ancillary_variable(av)
        return ones_cube

    # Match up data masks if required.
    if common_mask:
        # Create a cube of ones with a common mask.
        if ma.is_masked(cube_2.data):
            mask_cube = _ones_like(cube_2)
        else:
            mask_cube = 1.0
        if ma.is_masked(cube_1.data):
            # Take a slice to avoid unnecessary broadcasting of cube_2.
            slice_coords = [
                dim_coords_1[i]
                for i in range(cube_1.ndim)
                if dim_coords_1[i] not in common_dim_coords
                and np.array_equal(
                    cube_1.data.mask.any(axis=i),
                    cube_1.data.mask.all(axis=i),
                )
            ]
            cube_1_slice = next(cube_1.slices_over(slice_coords))
            mask_cube = _ones_like(cube_1_slice) * mask_cube
        # Apply the common mask to the data.
        if isinstance(mask_cube, iris.cube.Cube):
            cube_1 = cube_1 * mask_cube
            cube_2 = mask_cube * cube_2
            dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]

    # Broadcast weights to the shapes of the cubes if necessary.
    if weights is None or cube_1.shape == smaller_shape:
        weights_1 = weights
        weights_2 = weights
    else:
        if weights.shape != smaller_shape:
            raise ValueError(
                "weights array should have dimensions {}".format(
                    smaller_shape
                )
            )

        dims_1_common = [
            i
            for i in range(cube_1.ndim)
            if dim_coords_1[i] in common_dim_coords
        ]
        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
        if cube_2.shape != smaller_shape:
            dims_2_common = [
                i
                for i in range(cube_2.ndim)
                if dim_coords_2[i] in common_dim_coords
            ]
            weights_2 = broadcast_to_shape(
                weights, cube_2.shape, dims_2_common
            )
        else:
            weights_2 = weights

    # Calculate correlations.
    s1 = cube_1 - cube_1.collapsed(
        corr_coords, iris.analysis.MEAN, weights=weights_1
    )
    s2 = cube_2 - cube_2.collapsed(
        corr_coords, iris.analysis.MEAN, weights=weights_2
    )

    covar = (s1 * s2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
    )
    var_1 = (s1**2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_1
    )
    var_2 = (s2**2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_2
    )

    denom = iris.analysis.maths.apply_ufunc(
        np.sqrt, var_1 * var_2, new_unit=covar.units
    )
    corr_cube = covar / denom
    corr_cube.rename("Pearson's r")

    return corr_cube
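As a usage illustration, here is a minimal sketch (not part of the module source): it builds two small, hypothetical time/latitude/longitude cubes with synthetic data and computes their latitude/longitude pattern correlation at each time. The _demo_cube helper, coordinate values, and random data are all invented for the example.

import numpy as np

import iris
from iris.analysis.stats import pearsonr
from iris.coords import DimCoord


def _demo_cube(data):
    # Hypothetical helper: wrap a (time, lat, lon) array in a cube with
    # simple dimension coordinates.
    time = DimCoord(
        np.arange(data.shape[0]),
        standard_name="time",
        units="days since 2000-01-01",
    )
    lat = DimCoord(
        np.linspace(-45, 45, data.shape[1]),
        standard_name="latitude",
        units="degrees",
    )
    lon = DimCoord(
        np.linspace(0, 350, data.shape[2]),
        standard_name="longitude",
        units="degrees",
    )
    return iris.cube.Cube(
        data, dim_coords_and_dims=[(time, 0), (lat, 1), (lon, 2)]
    )


rng = np.random.default_rng(0)
cube_a = _demo_cube(rng.normal(size=(4, 10, 12)))
cube_b = _demo_cube(rng.normal(size=(4, 10, 12)))

# Pattern correlation over latitude/longitude at each time.
corr = pearsonr(cube_a, cube_b, corr_coords=["latitude", "longitude"])
print(corr.shape)  # -> (4,)

Because corr_coords names the latitude and longitude coordinates, those dimensions are collapsed and the result is a one-dimensional time cube of correlation values.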