Source code for iris.analysis.stats

# Copyright Iris contributors
#
# This file is part of Iris and is released under the LGPL license.
# See COPYING and COPYING.LESSER in the root of the repository for full
# licensing details.
"""
Statistical operations between cubes.

"""

import numpy as np
import numpy.ma as ma

import iris
from iris.util import broadcast_to_shape


def pearsonr(
    cube_a,
    cube_b,
    corr_coords=None,
    weights=None,
    mdtol=1.0,
    common_mask=False,
):
    """
    Calculate the Pearson's r correlation coefficient over specified
    dimensions.

    Args:

    * cube_a, cube_b (cubes):
        Cubes between which the correlation will be calculated. The cubes
        should either be the same shape and have the same dimension
        coordinates or one cube should be broadcastable to the other.
    * corr_coords (str or list of str):
        The cube coordinate name(s) over which to calculate correlations. If
        no names are provided then correlation will be calculated over all
        common cube dimensions.
    * weights (numpy.ndarray, optional):
        Weights array of same shape as (the smaller of) cube_a and cube_b.
        Note that latitude/longitude area weights can be calculated using
        :func:`iris.analysis.cartography.area_weights`.
    * mdtol (float, optional):
        Tolerance of missing data. The missing data fraction is calculated
        based on the number of grid cells masked in both cube_a and cube_b.
        If this fraction exceeds mdtol, the returned value in the
        corresponding cell is masked. mdtol=0 means no missing data is
        tolerated while mdtol=1 means the resulting element will be masked
        if and only if all contributing elements are masked in cube_a or
        cube_b. Defaults to 1.
    * common_mask (bool):
        If True, applies a common mask to cube_a and cube_b so only cells
        which are unmasked in both cubes contribute to the calculation. If
        False, the variance for each cube is calculated from all available
        cells. Defaults to False.

    Returns:
        A cube of the correlation between the two input cubes along the
        specified dimensions, at each point in the remaining dimensions of
        the cubes.

        For example providing two time/altitude/latitude/longitude cubes and
        corr_coords of 'latitude' and 'longitude' will result in a
        time/altitude cube describing the latitude/longitude (i.e. pattern)
        correlation at each time/altitude point.

    Reference:
        https://en.wikipedia.org/wiki/Pearson_correlation_coefficient

    This operation is non-lazy.

    """
    # Assign the larger cube to cube_1.
    if cube_b.ndim > cube_a.ndim:
        cube_1 = cube_b
        cube_2 = cube_a
    else:
        cube_1 = cube_a
        cube_2 = cube_b

    smaller_shape = cube_2.shape

    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
    # If no coords passed then set to all common dim coords of the cubes.
    if corr_coords is None:
        corr_coords = common_dim_coords

    def _ones_like(cube):
        # Return a copy of cube with the same mask, but all data values set
        # to 1. The operation is non-lazy.
        # For safety we also discard any cell-measures and
        # ancillary-variables, to avoid cube arithmetic possibly objecting
        # to them, or inadvertently retaining them in the result where they
        # might be inappropriate.
        ones_cube = cube.copy()
        ones_cube.data = np.ones_like(cube.data)
        ones_cube.rename("unknown")
        ones_cube.units = 1
        for cm in ones_cube.cell_measures():
            ones_cube.remove_cell_measure(cm)
        for av in ones_cube.ancillary_variables():
            ones_cube.remove_ancillary_variable(av)
        return ones_cube

    # Match up data masks if required.
    if common_mask:
        # Create a cube of ones with a common mask.
        if ma.is_masked(cube_2.data):
            mask_cube = _ones_like(cube_2)
        else:
            mask_cube = 1.0
        if ma.is_masked(cube_1.data):
            # Take a slice to avoid unnecessary broadcasting of cube_2.
            slice_coords = [
                dim_coords_1[i]
                for i in range(cube_1.ndim)
                if dim_coords_1[i] not in common_dim_coords
                and np.array_equal(
                    cube_1.data.mask.any(axis=i),
                    cube_1.data.mask.all(axis=i),
                )
            ]
            cube_1_slice = next(cube_1.slices_over(slice_coords))
            mask_cube = _ones_like(cube_1_slice) * mask_cube
        # Apply the common mask to the data.
        if isinstance(mask_cube, iris.cube.Cube):
            cube_1 = cube_1 * mask_cube
            cube_2 = mask_cube * cube_2
            dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]

    # Broadcast weights to the shapes of the cubes if necessary.
    if weights is None or cube_1.shape == smaller_shape:
        weights_1 = weights
        weights_2 = weights
    else:
        if weights.shape != smaller_shape:
            raise ValueError(
                "weights array should have dimensions {}".format(
                    smaller_shape
                )
            )

        dims_1_common = [
            i
            for i in range(cube_1.ndim)
            if dim_coords_1[i] in common_dim_coords
        ]
        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
        if cube_2.shape != smaller_shape:
            dims_2_common = [
                i
                for i in range(cube_2.ndim)
                if dim_coords_2[i] in common_dim_coords
            ]
            weights_2 = broadcast_to_shape(
                weights, cube_2.shape, dims_2_common
            )
        else:
            weights_2 = weights

    # Calculate correlations.
    s1 = cube_1 - cube_1.collapsed(
        corr_coords, iris.analysis.MEAN, weights=weights_1
    )
    s2 = cube_2 - cube_2.collapsed(
        corr_coords, iris.analysis.MEAN, weights=weights_2
    )

    covar = (s1 * s2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
    )
    var_1 = (s1**2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_1
    )
    var_2 = (s2**2).collapsed(
        corr_coords, iris.analysis.SUM, weights=weights_2
    )

    denom = iris.analysis.maths.apply_ufunc(
        np.sqrt, var_1 * var_2, new_unit=covar.units
    )
    corr_cube = covar / denom
    corr_cube.rename("Pearson's r")

    return corr_cube
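As a usage illustration, here is a minimal sketch (not part of the module source): it builds two small, hypothetical time/latitude/longitude cubes with synthetic data and computes their latitude/longitude pattern correlation at each time. The _demo_cube helper, coordinate values, and random data are all invented for the example.

import numpy as np

import iris
from iris.analysis.stats import pearsonr
from iris.coords import DimCoord


def _demo_cube(data):
    # Hypothetical helper: wrap a (time, lat, lon) array in a cube with
    # simple dimension coordinates.
    time = DimCoord(
        np.arange(data.shape[0]),
        standard_name="time",
        units="days since 2000-01-01",
    )
    lat = DimCoord(
        np.linspace(-45, 45, data.shape[1]),
        standard_name="latitude",
        units="degrees",
    )
    lon = DimCoord(
        np.linspace(0, 350, data.shape[2]),
        standard_name="longitude",
        units="degrees",
    )
    return iris.cube.Cube(
        data, dim_coords_and_dims=[(time, 0), (lat, 1), (lon, 2)]
    )


rng = np.random.default_rng(0)
cube_a = _demo_cube(rng.normal(size=(4, 10, 12)))
cube_b = _demo_cube(rng.normal(size=(4, 10, 12)))

# Pattern correlation over latitude/longitude at each time.
corr = pearsonr(cube_a, cube_b, corr_coords=["latitude", "longitude"])
print(corr.shape)  # -> (4,)

Because corr_coords names the latitude and longitude coordinates, those dimensions are collapsed and the result is a one-dimensional time cube of correlation values.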