Analysis & Statistics #

Statistics, extraction, overlay, apply, fill, histogram, and plotting.

`pyramids.dataset.ops.analysis.Analysis` #

Mixin providing analysis, statistics, and data extraction operations for Dataset.

Source code in src/pyramids/dataset/ops/analysis.py

class Analysis:
    """Mixin providing analysis, statistics, and data extraction operations for Dataset."""

    def stats(
        self, band: int | None = None, mask: GeoDataFrame | None = None
    ) -> DataFrame:
        """Get statistics of a band [Min, max, mean, std].

        Args:
            band (int, optional):
                Band index. If None, the statistics of all bands will be returned.
            mask (Polygon GeoDataFrame or Dataset, optional):
                GeodataFrame with a geometry of polygon type.

        Returns:
            DataFrame:
                DataFrame wit the stats of each band, the dataframe has the following columns
                [min, max, mean, std], the index of the dataframe is the band names.

                ```text

                                   Min         max        mean       std
                    Band_1  270.369720  270.762299  270.551361  0.154270
                    Band_2  269.611938  269.744751  269.673645  0.043788
                    Band_3  273.641479  274.168823  273.953979  0.198447
                    Band_4  273.991516  274.540344  274.310669  0.205754
                ```

        Notes:
            - The value of the stats will be stored in an xml file by the name of the raster file with the extension of
              .aux.xml.
            - The content of the file will be like the following:

              ```xml

                  <PAMDataset>
                    <PAMRasterBand band="1">
                      <Description>Band_1</Description>
                      <Metadata>
                        <MDI key="RepresentationType">ATHEMATIC</MDI>
                        <MDI key="STATISTICS_MAXIMUM">88</MDI>
                        <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
                        <MDI key="STATISTICS_MINIMUM">0</MDI>
                        <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
                        <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
                      </Metadata>
                    </PAMRasterBand>
                  </PAMDataset>

              ```

        Examples:
            - Get the statistics of all bands in the dataset:

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 10, 10)
              >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
              >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
              >>> print(dataset.stats()) # doctest: +SKIP
                           min       max      mean       std
              Band_1  0.006443  0.942943  0.468935  0.266634
              Band_2  0.020377  0.978130  0.477189  0.306864
              Band_3  0.019652  0.992184  0.537215  0.286502
              Band_4  0.011955  0.984313  0.503616  0.295852
              >>> print(dataset.stats(band=1))  # doctest: +SKIP
                           min      max      mean       std
              Band_2  0.020377  0.97813  0.477189  0.306864

              ```

            - Get the statistics of all the bands using a mask polygon.

              - Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2,
                0.2 -0.1] to cover the 4 cells.
              ```python
              >>> from shapely.geometry import Polygon
              >>> import geopandas as gpd
              >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
              >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                           min       max      mean       std
              Band_1  0.193441  0.702108  0.541478  0.202932
              Band_2  0.281281  0.932573  0.665602  0.239410
              Band_3  0.031395  0.982235  0.493086  0.377608
              Band_4  0.079562  0.930965  0.591025  0.341578

              ```

        """
        dst: Dataset | None = None
        if mask is not None:
            dst = self.crop(mask, touch=True)

        if band is None:
            df = pd.DataFrame(
                index=self.band_names,
                columns=["min", "max", "mean", "std"],
                dtype=np.float32,
            )
            for i in range(self.band_count):
                if mask is not None and dst is not None:
                    df.iloc[i, :] = dst._get_stats(i)
                else:
                    df.iloc[i, :] = self._get_stats(i)
        else:
            df = pd.DataFrame(
                index=[self.band_names[band]],
                columns=["min", "max", "mean", "std"],
                dtype=np.float32,
            )
            if mask is not None and dst is not None:
                df.iloc[0, :] = dst._get_stats(band)
            else:
                df.iloc[0, :] = self._get_stats(band)

        return df

    def _get_stats(self, band: int | None = None) -> list[float]:
        """_get_stats."""
        band_index = band if band is not None else 0
        band_i = self._iloc(band_index)
        try:
            vals = band_i.GetStatistics(True, True)
        except RuntimeError:
            # when the GetStatistics gives an error "RuntimeError: Failed to compute statistics, no valid pixels
            # found in sampling."
            vals = [0]

        if sum(vals) == 0:
            warnings.warn(
                f"Band {band} has no statistics, and the statistics are going to be calculate"
            )
            vals = band_i.ComputeStatistics(False)

        return list(vals)

    def count_domain_cells(self, band: int = 0) -> int:
        """Count cells inside the domain.

        Args:
            band (int):
                Band index. Default is 0.

        Returns:
            int:
                Number of cells.
        """
        arr = self.read_array(band=band)
        domain_count = np.size(arr[:, :]) - np.count_nonzero(
            arr[np.isclose(arr, self.no_data_value[band], rtol=0.001)]
        )
        return int(domain_count)

    def apply(self, func, band: int = 0, inplace: bool = False) -> Dataset:
        """Apply a function to all domain cells.

        - apply method executes a mathematical operation on the raster array.
        - The function is applied to all domain cells at once using vectorized NumPy operations.

        Args:
            func (function):
                Defined function that takes one input (the cell value).
            band (int):
                Band number.
            inplace (bool):
                If True, the original dataset will be modified. If False, a new dataset will be created.
                Default is False.

        Returns:
            Dataset:
                A new Dataset with the function applied. If inplace is True, returns self.

        Examples:
            - Create a dataset from an array filled with values between -1 and 1:

              ```python
              >>> import numpy as np
              >>> arr = np.random.uniform(-1, 1, size=(5, 5))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset.read_array()) # doctest: +SKIP
              [[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
               [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
               [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
               [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
               [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

              ```

            - Apply the absolute function to the dataset:

              ```python
              >>> abs_dataset = dataset.apply(np.abs)
              >>> print(abs_dataset.read_array()) # doctest: +SKIP
              [[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
               [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
               [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
               [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
               [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

              ```
        """
        if not callable(func):
            raise TypeError("The second argument should be a function")

        no_data_value = self.no_data_value[band]
        src_array = self.read_array(band)
        dtype = self.gdal_dtype[band]

        new_array = np.full((self.rows, self.columns), no_data_value, dtype=src_array.dtype)
        domain_mask = ~np.isclose(src_array, no_data_value, rtol=0.001)
        domain_values = src_array[domain_mask]
        try:
            new_array[domain_mask] = func(domain_values)
        except (ValueError, TypeError):
            new_array[domain_mask] = np.vectorize(func)(domain_values)

        dst_obj = type(self)._build_dataset(
            self.columns, self.rows, 1, dtype, self.geotransform, self.crs, no_data_value
        )
        dst_obj.raster.GetRasterBand(1).WriteArray(new_array)

        if inplace:
            self._update_inplace(dst_obj.raster)
            return self
        return dst_obj

    def fill(
        self, value: float | int, inplace: bool = False, path: str | Path | None = None
    ) -> Dataset:
        """Fill the domain cells with a certain value.

            Fill takes a raster and fills it with one value

        Args:
            value (float | int):
                Numeric value to fill.
            inplace (bool):
                If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.
            path (str):
                Path including the extension (.tif).

        Returns:
            Dataset:
                A new Dataset with cells filled. If inplace is True, returns self.

        Examples:
            - Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 5, size=(5, 5))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset.read_array()) # doctest: +SKIP
              [[1 1 3 1 2]
               [2 2 2 1 2]
               [2 2 3 1 3]
               [3 4 3 3 4]
               [4 4 2 1 1]]
              >>> new_dataset = dataset.fill(10)
              >>> print(new_dataset.read_array())
              [[10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]]

              ```
        """
        no_data_value = self.no_data_value[0]
        src_array = self.raster.ReadAsArray()

        if no_data_value is None:
            no_data_value = np.nan

        if not np.isnan(no_data_value):
            src_array[~np.isclose(src_array, no_data_value, rtol=0.000001)] = value
        else:
            src_array[~np.isnan(src_array)] = value

        dst = type(self).dataset_like(self, src_array, path=path)
        if inplace:
            self._update_inplace(dst.raster)
            return self
        return dst

    def extract(
        self,
        band: int | None = None,
        exclude_value: Any | None = None,
        mask: FeatureCollection | GeoDataFrame | None = None,
    ) -> np.ndarray:
        """Extract.

        - Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
        - If the mask parameter is given, the raster will be clipped to the extent of the given mask and the
          values within the mask are extracted.

        Args:
            band (int, optional):
                Band index. Default is None.
            exclude_value (Numeric, optional):
                Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value`
                will be filtered out from the first band only.
            mask (FeatureCollection | GeoDataFrame, optional):
                Vector data containing point geometries at which to extract the values. Default is None.

        Returns:
            np.ndarray:
                The extracted values from each band in the dataset will be in one row in the returned array.

        Examples:
            - Extract all values from the dataset:

              - First, create a dataset with 2 bands, 4 rows and 4 columns:

                ```python
                >>> import numpy as np
                >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
                >>> top_left_corner = (0, 0)
                >>> cell_size = 0.05
                >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
                >>> print(dataset)
                <BLANKLINE>
                            Cell size: 0.05
                            Dimension: 4 * 4
                            EPSG: 4326
                            Number of Bands: 2
                            Band names: ['Band_1', 'Band_2']
                            Mask: -9999.0
                            Data type: int32
                            File:...
                <BLANKLINE>
                >>> print(dataset.read_array()) # doctest: +SKIP
                [[[1 3 3 4]
                  [1 4 2 4]
                  [2 4 2 1]
                  [1 3 2 3]]
                 [[3 2 1 3]
                  [4 3 2 2]
                  [2 2 3 4]
                  [1 4 1 4]]]

                ```

              - Now, extract the values in the dataset:

                ```python
                >>> values = dataset.extract()
                >>> print(values) # doctest: +SKIP
                [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
                 [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

                ```

              - Extract all the values except 2:

                ```python
                >>> values = dataset.extract(exclude_value=2)
                >>> print(values) # doctest: +SKIP

                ```

            - Extract values at the location of the given point geometries:

              ```python
              >>> import geopandas as gpd
              >>> from shapely.geometry import Point
              ```

              - Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

                ```python
                >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
                >>> values = dataset.extract(mask=points)
                >>> print(values) # doctest: +SKIP
                [[4 3 3 4]
                 [3 4 4 2]]

                ```
        """
        # Optimize: make the read_array return only the array for inside the mask feature, and not to read the whole
        #  raster
        arr = self.read_array(band=band)
        no_data_value = (
            self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
        )
        if mask is None:
            exclude_list = (
                [no_data_value, exclude_value]
                if exclude_value is not None
                else [no_data_value]
            )
            values = get_pixels2(arr, exclude_list)
        else:
            indices = self.map_to_array_coordinates(mask)
            if arr.ndim > 2:
                values = arr[:, indices[:, 0], indices[:, 1]]
            else:
                values = arr[indices[:, 0], indices[:, 1]]

        return np.asarray(values)

    def overlay(
        self: Dataset,
        classes_map,
        band: int = 0,
        exclude_value: float | int | None = None,
    ) -> dict[list[float], list[float]]:
        """Overlay.

        Overlay method extracts all the values in the dataset for each class in the given class map.

        Args:
            classes_map (Dataset):
                Dataset object for the raster that has classes you want to overlay with the raster.
            band (int):
                If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.
            exclude_value (Numeric, optional):
                Values you want to exclude from extracted values. Default is None.

        Returns:
            Dict:
                Dictionary with class values as keys (from the class map), and for each key a list of all the intersected
                values in the base map.

        Examples:
            - Read the dataset:

              ```python
              >>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
              >>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

              ```

              ![rhine-rainfall](./../../_images/dataset/rhine-rainfall.png)

            - Read the classes dataset:

              ```python
              >>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
              >>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

              ```

              ![rhine-classes](./../../_images/dataset/rhine-classes.png)

            - Overlay the dataset with the classes dataset:

              ```python
              >>> classes_dict = dataset.overlay(classes)
              >>> print(classes_dict.keys()) # doctest: +SKIP
              dict_keys([1, 2, 3, 4, 5])

              ```

            - You can use the key `1` to get the values that overlay class 1.
        """
        if not self._check_alignment(classes_map):
            raise AlignmentError(
                "The class Dataset is not aligned with the current raster, please use the method "
                "'align' to align both rasters."
            )
        arr = self.read_array(band=band)
        no_data_value = (
            self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
        )
        mask = (
            [no_data_value, exclude_value]
            if exclude_value is not None
            else [no_data_value]
        )
        ind = get_indices2(arr, mask)
        classes = classes_map.read_array()
        values: dict[Any, list[Any]] = dict()

        # extract values
        for i, ind_i in enumerate(ind):
            # first check if the sub-basin has a list in the dict if not create a list
            key = classes[ind_i[0], ind_i[1]]
            if key not in list(values.keys()):
                values[key] = list()

            values[key].append(arr[ind_i[0], ind_i[1]])

        return values

    def get_mask(self, band: int = 0) -> np.ndarray:
        """Get the mask array.

        Args:
            band (int):
                Band index. Default is 0.

        Returns:
            np.ndarray:
                Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.
        """
        # TODO: there is a CreateMaskBand method in the gdal.Dataset class, it creates a mask band for the dataset
        #   either internally or externally.
        arr = np.asarray(self._iloc(band).GetMaskBand().ReadAsArray())
        return arr

    def footprint(
        self: Dataset,
        band: int = 0,
        exclude_values: list[Any] | None = None,
    ) -> GeoDataFrame | None:
        """Extract the real coverage of the values in a certain band.

        Args:
            band (int):
                Band index. Default is 0.
            exclude_values (List[Any] | None):
                If you want to exclude a certain value in the raster with another value inter the two values as a
                list of tuples a [(value_to_be_exclude_valuesd, new_value)].

                - Example of exclude_values usage:

                  ```python
                  >>> exclude_values = [0]

                  ```

                - This parameter is introduced particularly in the case of rasters that has the no_data_value stored in
                  the `no_data_value` property does not match the value stored in the band, so this option can correct
                  this behavior.

        Returns:
            GeoDataFrame:
                - geodataframe containing the polygon representing the extent of the raster. the extent column should
                  contain a value of 2 only.
                - if the dataset had separate polygons, each polygon will be in a separate row.

        Examples:
            - The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with
              zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

              ```python
              >>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
              >>> dataset.plot()
              (<Figure size 800x800 with 2 Axes>, <Axes: >)

              ```

            ![dataset-footprint-rhine-flood](./../../_images/dataset/dataset-footprint-rhine-flood.png)

            - Now, to extract the footprint of the dataset band, we need to specify the `exclude_values` parameter with the
              value of the non-flooded cells.

              ```python
              >>> extent = dataset.footprint(band=0, exclude_values=[0])
              >>> print(extent)
                 Band_1                                           geometry
              0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
              1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
              2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
              3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
              4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
              5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
              6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

              >>> extent.plot()
              <Axes: >

              ```

            ![dataset-footprint-rhine-flood-extent](./../../_images/dataset/dataset-footprint-rhine-flood-extent.png)

        """
        arr = self.read_array(band=band)
        no_data_val = self.no_data_value[band]

        if no_data_val is None:
            if not (np.isnan(arr)).any():
                self.logger.warning(
                    "The nodata value stored in the raster does not exist in the raster "
                    "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                    " not correct"
                )
        else:
            if not (np.isclose(arr, no_data_val, rtol=0.00001)).any():
                self.logger.warning(
                    "the nodata value stored in the raster does not exist in the raster "
                    "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                    " not correct"
                )
        # if you want to exclude_values any value in the raster
        if exclude_values:
            for val in exclude_values:
                try:
                    # in case the val2 is None, and the array is int type, the following line will give error as None
                    # is considered as float
                    arr[np.isclose(arr, val)] = no_data_val
                except TypeError:
                    arr = arr.astype(np.float32)
                    arr[np.isclose(arr, val)] = no_data_val

        # replace all the values with 2
        if no_data_val is None:
            # check if the whole raster is full of no_data_value
            if (np.isnan(arr)).all():
                self.logger.warning("the raster is full of no_data_value")
                return None

            arr[~np.isnan(arr)] = 2
        else:
            # check if the whole raster is full of no_data_value
            if (np.isclose(arr, no_data_val, rtol=0.00001)).all():
                self.logger.warning("the raster is full of no_data_value")
                return None

            arr[~np.isclose(arr, no_data_val, rtol=0.00001)] = 2
        new_dataset = self.create_from_array(
            arr, geo=self.geotransform, epsg=self.epsg, no_data_value=self.no_data_value
        )
        # then convert the raster into polygon
        gdf = new_dataset.cluster2(band=band)
        gdf.rename(columns={"Band_1": self.band_names[band]}, inplace=True)

        return gdf

    @staticmethod
    def normalize(array: np.ndarray) -> np.ndarray:
        """Normalize numpy arrays into scale 0.0-1.0.

        Args:
            array (np.ndarray): Numpy array to normalize.

        Returns:
            np.ndarray: Normalized array.
        """
        array_min = array.min()
        array_max = array.max()
        val = (array - array_min) / (array_max - array_min)
        return np.asarray(val)

    @staticmethod
    def _rescale(array: np.ndarray, min_value: float, max_value: float) -> np.ndarray:
        val = (array - min_value) / (max_value - min_value)
        return val

    def get_histogram(
        self: Dataset,
        band: int = 0,
        bins: int = 6,
        min_value: float | None = None,
        max_value: float | None = None,
        include_out_of_range: bool = False,
        approx_ok: bool = False,
    ) -> tuple[list, list[tuple[Any, Any]]]:
        """Get histogram.

        Args:
            band (int, optional):
                Band index. Default is 1.
            bins (int, optional):
                Number of bins. Default is 6.
            min_value (float, optional):
                Minimum value. Default is None.
            max_value (float, optional):
                Maximum value. Default is None.
            include_out_of_range (bool, optional):
                If True, add out-of-range values into the first and last buckets. Default is False.
            approx_ok (bool, optional):
                If True, compute an approximate histogram by using subsampling or overviews. Default is False.

        Returns:
            tuple[list, list[tuple[Any, Any]]]:
                Histogram values and bin edges.

        Hint:
            - The value of the histogram will be stored in an xml file by the name of the raster file with the extension
                of .aux.xml.

            - The content of the file will be like the following:
              ```xml

                  <PAMDataset>
                    <PAMRasterBand band="1">
                      <Description>Band_1</Description>
                      <Histograms>
                        <HistItem>
                          <HistMin>0</HistMin>
                          <HistMax>88</HistMax>
                          <BucketCount>6</BucketCount>
                          <IncludeOutOfRange>0</IncludeOutOfRange>
                          <Approximate>0</Approximate>
                          <HistCounts>75|6|0|4|2|1</HistCounts>
                        </HistItem>
                      </Histograms>
                    </PAMRasterBand>
                  </PAMDataset>

              ```

        Examples:
            - Create `Dataset` consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 12, size=(10, 10))
              >>> print(arr)    # doctest: +SKIP
              [[ 4  1  1  2  6  9  2  5  1  8]
               [ 1 11  5  6  2  5  4  6  6  7]
               [ 5  2 10  4  8 11  4 11 11  1]
               [ 2  3  6  3  1  5 11 10 10  7]
               [ 8  2 11  3  1  3  5  4 10 10]
               [ 1  2  1  6 10  3  6  4  2  8]
               [ 9  5  7  9  7  8  1 11  4  4]
               [ 7  7  2  2  5  3  7  2  9  9]
               [ 2 10  3  2  1 11  5  9  8 11]
               [ 1  5  6 11  3  3  8  1  2  1]]
               >>> top_left_corner = (0, 0)
               >>> cell_size = 0.05
               >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

               ```

            - Now, let's get the histogram of the first band using the `get_histogram` method with the default
                parameters:
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0)
                >>> print(hist)  # doctest: +SKIP
                [28, 17, 10, 15, 13, 7]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

                ```
            - we can also exclude values from the histogram by using the `min_value` and `max_value`:
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
                >>> print(hist)  # doctest: +SKIP
                [10, 8, 7, 7, 6, 0]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

                ```
            - For datasets with big dimensions, computing the histogram can take some time; approximating the computation
                of the histogram can save a lot of computation time. When using the parameter `approx_ok` with a `True`
                value the histogram will be calculated from resampling the band or from the overviews if they exist.
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
                >>> print(hist)  # doctest: +SKIP
                [28, 17, 10, 15, 13, 7]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

                ```
            - As you see for small datasets, the approximation of the histogram will be the same as without approximation.

        """
        band_obj = self._iloc(band)
        min_val, max_val = band_obj.ComputeRasterMinMax()
        if min_value is None:
            min_value = min_val
        if max_value is None:
            max_value = max_val

        bin_width = (max_value - min_value) / bins
        ranges = [
            (min_val + i * bin_width, min_val + (i + 1) * bin_width)
            for i in range(bins)
        ]

        hist = band_obj.GetHistogram(
            min=min_value,
            max=max_value,
            buckets=bins,
            include_out_of_range=include_out_of_range,
            approx_ok=approx_ok,
        )
        return hist, ranges
    def plot(
        self,
        band: int | None = None,
        exclude_value: Any | None = None,
        rgb: list[int] | None = None,
        surface_reflectance: int | None = None,
        cutoff: list | None = None,
        overview: bool | None = False,
        overview_index: int | None = 0,
        percentile: int | None = None,
        **kwargs: Any,
    ) -> ArrayGlyph:
        """Plot the values/overviews of a given band.
        The plot function uses the `cleopatra` as a backend to plot the raster data, for more information check
        [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/#cleopatra.array_glyph.ArrayGlyph.plot).
        Args:
            band (int, optional):
                The band you want to get its data. Default is 0.
            exclude_value (Any, optional):
                Value to exclude from the plot. Default is None.
            rgb (List[int], optional):
                The indices of the red, green, and blue bands in the `Dataset`. the `rgb` parameter can be a list of
                three values, or a list of four values if the alpha band is also included.
                The `plot` method will check if the rgb bands are defined in the `Dataset`, if all the three bands (
                red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands
                will be considered as [2,1,0] as the default order for sentinel tif files.
            surface_reflectance (int, optional):
                Surface reflectance value for normalizing satellite data, by default None.
                Typically 10000 for Sentinel-2 data.
            cutoff (List, optional):
                clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff
                and scale them back to between 0 and 1). Default is None.
            overview (bool, optional):
                True if you want to plot the overview. Default is False.
            overview_index (int, optional):
                Index of the overview. Default is 0.
            percentile: int
                The percentile value to be used for scaling.
        kwargs:
                | Parameter                   | Type                | Description |
                |-----------------------------|---------------------|-------------|
                | `points`                    | array               | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
                | `point_color`               | str                 | Color of the point. |
                | `point_size`                | Any                 | Size of the point. |
                | `pid_color`                 | str                 | Color of the annotation of the point. Default is blue. |
                | `pid_size`                  | Any                 | Size of the point annotation. |
                | `figsize`                   | tuple, optional     | Figure size. Default is `(8, 8)`. |
                | `title`                     | str, optional       | Title of the plot. Default is `'Total Discharge'`. |
                | `title_size`                | int, optional       | Title size. Default is `15`. |
                | `orientation`               | str, optional       | Orientation of the color bar (`horizontal` or `vertical`). Default is `'vertical'`. |
                | `rotation`                  | number, optional    | Rotation of the color bar label. Default is `-90`. |
                | `cbar_length`               | float, optional     | Ratio to control the height of the color bar. Default is `0.75`. |
                | `ticks_spacing`             | int, optional       | Spacing between color bar ticks. Default is `2`. |
                | `cbar_label_size`           | int, optional       | Size of the color bar label. Default is `12`. |
                | `cbar_label`                | str, optional       | Label of the color bar. Default is `'Discharge m\u00b3/s'`. |
                | `color_scale`               | int, optional       | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is `1`. |
                | `gamma`                     | float, optional     | Value needed for color scale option 2. Default is `1/2`. |
                | `line_threshold`            | float, optional     | Value needed for color scale option 3. Default is `0.0001`. |
                | `line_scale`                | float, optional     | Value needed for color scale option 3. Default is `0.001`. |
                | `bounds`                    | list, optional      | Discrete bounds for color scale option 4. Default is `None`. |
                | `midpoint`                  | float, optional     | Value needed for color scale option 5. Default is `0`. |
                | `cmap`                      | str, optional       | Color map style. Default is `'coolwarm_r'`. |
                | `display_cell_value`        | bool, optional      | Whether to display cell values as text. |
                | `num_size`                  | int, optional       | Size of numbers plotted on top of each cell. Default is `8`. |
                | `background_color_threshold`| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If `None`, max value / 2 is used. Default is `None`. |
        Returns:
            ArrayGlyph:
                ArrayGlyph object. For more details of the ArrayGlyph object check the [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/).
        Examples:
            - Plot a certain band:
              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 10, 10)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
              >>> dataset.plot(band=0)
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using power scale.
              ```python
              >>> dataset.plot(band=0, color_scale="power")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using SymLogNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="sym-lognorm")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using PowerNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using BoundaryNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="midpoint")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
        """
        import_cleopatra(
            "The current function uses cleopatra package to for plotting, please install it manually, for more info "
            "check https://github.com/serapeum-org/cleopatra"
        )
        from cleopatra.array_glyph import ArrayGlyph
        no_data_value = [np.nan if i is None else i for i in self.no_data_value]
        if overview:
            arr = self.read_overview_array(
                band=band,
                overview_index=overview_index if overview_index is not None else 0,
            )
        else:
            arr = self.read_array(band=band)
        # if the raster has three bands or more.
        if self.band_count >= 3:
            if band is None:
                if rgb is None:
                    rgb_candidate: list[int | None] = [
                        self.get_band_by_color("red"),
                        self.get_band_by_color("green"),
                        self.get_band_by_color("blue"),
                    ]
                    if None in rgb_candidate:
                        rgb = [2, 1, 0]
                    else:
                        rgb = [int(v) for v in rgb_candidate if v is not None]
                # first make the band index the first band in the rgb list (red band)
                band = rgb[0]
        # elif self.band_count == 1:
        #     band = 0
        else:
            if band is None:
                band = 0
        exclude_value = (
            [no_data_value[band], exclude_value]
            if exclude_value is not None
            else [no_data_value[band]]
        )
        cleo = ArrayGlyph(
            arr,
            exclude_value=exclude_value,
            extent=self.bbox,
            rgb=rgb,
            surface_reflectance=surface_reflectance,
            cutoff=cutoff,
            percentile=percentile,
            **kwargs,
        )
        cleo.plot(**kwargs)
        return cleo
    @staticmethod
    def _process_color_table(color_table: DataFrame) -> DataFrame:
        import_cleopatra(
            "The current function uses cleopatra package to for plotting, please install it manually, for more info"
            " check https://github.com/serapeum-org/cleopatra"
        )
        from cleopatra.colors import Colors
        # if the color_table does not contain the red, green, and blue columns, assume it has one column with
        # the color as hex and then, convert the color to rgb.
        if all(elem in color_table.columns for elem in ["red", "green", "blue"]):
            color_df = color_table.loc[:, ["values", "red", "green", "blue"]]
        elif "color" in color_table.columns:
            color = Colors(color_table["color"].tolist())
            color_rgb = color.to_rgb(normalized=False)
            color_df = DataFrame(columns=["values"])
            color_df["values"] = color_table["values"].to_list()
            color_df.loc[:, ["red", "green", "blue"]] = color_rgb
        else:
            raise ValueError(
                f"color_table must contain either red, green, blue, or color columns. given columns are: "
                f"{color_table.columns}"
            )
        if "alpha" not in color_table.columns:
            color_df.loc[:, "alpha"] = 255
        else:
            color_df.loc[:, "alpha"] = color_table["alpha"]
        return color_df

`stats(band=None, mask=None)` #

Get statistics of a band [Min, max, mean, std].

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. If None, the statistics of all bands will be returned.	`None`
`mask`	`Polygon GeoDataFrame or Dataset`	GeodataFrame with a geometry of polygon type.	`None`

Returns:

Name Type Description

DataFrame

DataFrame wit the stats of each band, the dataframe has the following columns [min, max, mean, std], the index of the dataframe is the band names.

                   Min         max        mean       std
    Band_1  270.369720  270.762299  270.551361  0.154270
    Band_2  269.611938  269.744751  269.673645  0.043788
    Band_3  273.641479  274.168823  273.953979  0.198447
    Band_4  273.991516  274.540344  274.310669  0.205754

Notes

The value of the stats will be stored in an xml file by the name of the raster file with the extension of .aux.xml.
The content of the file will be like the following:

    <PAMDataset>
      <PAMRasterBand band="1">
        <Description>Band_1</Description>
        <Metadata>
          <MDI key="RepresentationType">ATHEMATIC</MDI>
          <MDI key="STATISTICS_MAXIMUM">88</MDI>
          <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
          <MDI key="STATISTICS_MINIMUM">0</MDI>
          <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
          <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
        </Metadata>
      </PAMRasterBand>
    </PAMDataset>

Examples:

Get the statistics of all bands in the dataset:

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
>>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
>>> print(dataset.stats()) # doctest: +SKIP
             min       max      mean       std
Band_1  0.006443  0.942943  0.468935  0.266634
Band_2  0.020377  0.978130  0.477189  0.306864
Band_3  0.019652  0.992184  0.537215  0.286502
Band_4  0.011955  0.984313  0.503616  0.295852
>>> print(dataset.stats(band=1))  # doctest: +SKIP
             min      max      mean       std
Band_2  0.020377  0.97813  0.477189  0.306864

Get the statistics of all the bands using a mask polygon.

Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2 -0.1] to cover the 4 cells.

>>> from shapely.geometry import Polygon
>>> import geopandas as gpd
>>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
>>> print(dataset.stats(mask=mask))  # doctest: +SKIP
             min       max      mean       std
Band_1  0.193441  0.702108  0.541478  0.202932
Band_2  0.281281  0.932573  0.665602  0.239410
Band_3  0.031395  0.982235  0.493086  0.377608
Band_4  0.079562  0.930965  0.591025  0.341578

Source code in src/pyramids/dataset/ops/analysis.py

def stats(
    self, band: int | None = None, mask: GeoDataFrame | None = None
) -> DataFrame:
    """Get statistics of a band [Min, max, mean, std].

    Args:
        band (int, optional):
            Band index. If None, the statistics of all bands will be returned.
        mask (Polygon GeoDataFrame or Dataset, optional):
            GeodataFrame with a geometry of polygon type.

    Returns:
        DataFrame:
            DataFrame wit the stats of each band, the dataframe has the following columns
            [min, max, mean, std], the index of the dataframe is the band names.

            ```text

                               Min         max        mean       std
                Band_1  270.369720  270.762299  270.551361  0.154270
                Band_2  269.611938  269.744751  269.673645  0.043788
                Band_3  273.641479  274.168823  273.953979  0.198447
                Band_4  273.991516  274.540344  274.310669  0.205754
            ```

    Notes:
        - The value of the stats will be stored in an xml file by the name of the raster file with the extension of
          .aux.xml.
        - The content of the file will be like the following:

          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Metadata>
                    <MDI key="RepresentationType">ATHEMATIC</MDI>
                    <MDI key="STATISTICS_MAXIMUM">88</MDI>
                    <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
                    <MDI key="STATISTICS_MINIMUM">0</MDI>
                    <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
                    <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
                  </Metadata>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Get the statistics of all bands in the dataset:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
          >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
          >>> print(dataset.stats()) # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.006443  0.942943  0.468935  0.266634
          Band_2  0.020377  0.978130  0.477189  0.306864
          Band_3  0.019652  0.992184  0.537215  0.286502
          Band_4  0.011955  0.984313  0.503616  0.295852
          >>> print(dataset.stats(band=1))  # doctest: +SKIP
                       min      max      mean       std
          Band_2  0.020377  0.97813  0.477189  0.306864

          ```

        - Get the statistics of all the bands using a mask polygon.

          - Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2,
            0.2 -0.1] to cover the 4 cells.
          ```python
          >>> from shapely.geometry import Polygon
          >>> import geopandas as gpd
          >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
          >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.193441  0.702108  0.541478  0.202932
          Band_2  0.281281  0.932573  0.665602  0.239410
          Band_3  0.031395  0.982235  0.493086  0.377608
          Band_4  0.079562  0.930965  0.591025  0.341578

          ```

    """
    dst: Dataset | None = None
    if mask is not None:
        dst = self.crop(mask, touch=True)

    if band is None:
        df = pd.DataFrame(
            index=self.band_names,
            columns=["min", "max", "mean", "std"],
            dtype=np.float32,
        )
        for i in range(self.band_count):
            if mask is not None and dst is not None:
                df.iloc[i, :] = dst._get_stats(i)
            else:
                df.iloc[i, :] = self._get_stats(i)
    else:
        df = pd.DataFrame(
            index=[self.band_names[band]],
            columns=["min", "max", "mean", "std"],
            dtype=np.float32,
        )
        if mask is not None and dst is not None:
            df.iloc[0, :] = dst._get_stats(band)
        else:
            df.iloc[0, :] = self._get_stats(band)

    return df

`count_domain_cells(band=0)` #

Count cells inside the domain.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`

Returns:

Name	Type	Description
`int`	`int`	Number of cells.

Source code in src/pyramids/dataset/ops/analysis.py

def count_domain_cells(self, band: int = 0) -> int:
    """Count cells inside the domain.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        int:
            Number of cells.
    """
    arr = self.read_array(band=band)
    domain_count = np.size(arr[:, :]) - np.count_nonzero(
        arr[np.isclose(arr, self.no_data_value[band], rtol=0.001)]
    )
    return int(domain_count)

`apply(func, band=0, inplace=False)` #

Apply a function to all domain cells.

apply method executes a mathematical operation on the raster array.
The function is applied to all domain cells at once using vectorized NumPy operations.

Parameters:

Name	Type	Description	Default
`func`	`function`	Defined function that takes one input (the cell value).	required
`band`	`int`	Band number.	`0`
`inplace`	`bool`	If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.	`False`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	A new Dataset with the function applied. If inplace is True, returns self.

Examples:

Create a dataset from an array filled with values between -1 and 1:

>>> import numpy as np
>>> arr = np.random.uniform(-1, 1, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
 [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
 [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
 [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
 [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

Apply the absolute function to the dataset:

>>> abs_dataset = dataset.apply(np.abs)
>>> print(abs_dataset.read_array()) # doctest: +SKIP
[[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
 [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
 [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
 [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
 [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

Source code in src/pyramids/dataset/ops/analysis.py

def apply(self, func, band: int = 0, inplace: bool = False) -> Dataset:
    """Apply a function to all domain cells.

    - apply method executes a mathematical operation on the raster array.
    - The function is applied to all domain cells at once using vectorized NumPy operations.

    Args:
        func (function):
            Defined function that takes one input (the cell value).
        band (int):
            Band number.
        inplace (bool):
            If True, the original dataset will be modified. If False, a new dataset will be created.
            Default is False.

    Returns:
        Dataset:
            A new Dataset with the function applied. If inplace is True, returns self.

    Examples:
        - Create a dataset from an array filled with values between -1 and 1:

          ```python
          >>> import numpy as np
          >>> arr = np.random.uniform(-1, 1, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
           [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
           [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
           [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
           [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

          ```

        - Apply the absolute function to the dataset:

          ```python
          >>> abs_dataset = dataset.apply(np.abs)
          >>> print(abs_dataset.read_array()) # doctest: +SKIP
          [[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
           [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
           [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
           [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
           [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

          ```
    """
    if not callable(func):
        raise TypeError("The second argument should be a function")

    no_data_value = self.no_data_value[band]
    src_array = self.read_array(band)
    dtype = self.gdal_dtype[band]

    new_array = np.full((self.rows, self.columns), no_data_value, dtype=src_array.dtype)
    domain_mask = ~np.isclose(src_array, no_data_value, rtol=0.001)
    domain_values = src_array[domain_mask]
    try:
        new_array[domain_mask] = func(domain_values)
    except (ValueError, TypeError):
        new_array[domain_mask] = np.vectorize(func)(domain_values)

    dst_obj = type(self)._build_dataset(
        self.columns, self.rows, 1, dtype, self.geotransform, self.crs, no_data_value
    )
    dst_obj.raster.GetRasterBand(1).WriteArray(new_array)

    if inplace:
        self._update_inplace(dst_obj.raster)
        return self
    return dst_obj

`fill(value, inplace=False, path=None)` #

Fill the domain cells with a certain value.

Fill takes a raster and fills it with one value

Parameters:

Name	Type	Description	Default
`value`	`float \| int`	Numeric value to fill.	required
`inplace`	`bool`	If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.	`False`
`path`	`str`	Path including the extension (.tif).	`None`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	A new Dataset with cells filled. If inplace is True, returns self.

Examples:

Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 5, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[1 1 3 1 2]
 [2 2 2 1 2]
 [2 2 3 1 3]
 [3 4 3 3 4]
 [4 4 2 1 1]]
>>> new_dataset = dataset.fill(10)
>>> print(new_dataset.read_array())
[[10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]]

Source code in src/pyramids/dataset/ops/analysis.py

def fill(
    self, value: float | int, inplace: bool = False, path: str | Path | None = None
) -> Dataset:
    """Fill the domain cells with a certain value.

        Fill takes a raster and fills it with one value

    Args:
        value (float | int):
            Numeric value to fill.
        inplace (bool):
            If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.
        path (str):
            Path including the extension (.tif).

    Returns:
        Dataset:
            A new Dataset with cells filled. If inplace is True, returns self.

    Examples:
        - Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 5, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[1 1 3 1 2]
           [2 2 2 1 2]
           [2 2 3 1 3]
           [3 4 3 3 4]
           [4 4 2 1 1]]
          >>> new_dataset = dataset.fill(10)
          >>> print(new_dataset.read_array())
          [[10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]]

          ```
    """
    no_data_value = self.no_data_value[0]
    src_array = self.raster.ReadAsArray()

    if no_data_value is None:
        no_data_value = np.nan

    if not np.isnan(no_data_value):
        src_array[~np.isclose(src_array, no_data_value, rtol=0.000001)] = value
    else:
        src_array[~np.isnan(src_array)] = value

    dst = type(self).dataset_like(self, src_array, path=path)
    if inplace:
        self._update_inplace(dst.raster)
        return self
    return dst

`extract(band=None, exclude_value=None, mask=None)` #

Extract.

Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
If the mask parameter is given, the raster will be clipped to the extent of the given mask and the values within the mask are extracted.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is None.	`None`
`exclude_value`	`Numeric`	Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value` will be filtered out from the first band only.	`None`
`mask`	`FeatureCollection \| GeoDataFrame`	Vector data containing point geometries at which to extract the values. Default is None.	`None`

Returns:

Type	Description
`ndarray`	np.ndarray: The extracted values from each band in the dataset will be in one row in the returned array.

Examples:

Extract all values from the dataset:

First, create a dataset with 2 bands, 4 rows and 4 columns:

>>> import numpy as np
>>> arr = np.random.randint(1, 5, size=(2, 4, 4))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 4 * 4
            EPSG: 4326
            Number of Bands: 2
            Band names: ['Band_1', 'Band_2']
            Mask: -9999.0
            Data type: int32
            File:...
<BLANKLINE>
>>> print(dataset.read_array()) # doctest: +SKIP
[[[1 3 3 4]
  [1 4 2 4]
  [2 4 2 1]
  [1 3 2 3]]
 [[3 2 1 3]
  [4 3 2 2]
  [2 2 3 4]
  [1 4 1 4]]]

Now, extract the values in the dataset:

>>> values = dataset.extract()
>>> print(values) # doctest: +SKIP
[[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
 [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

Extract all the values except 2:

>>> values = dataset.extract(exclude_value=2)
>>> print(values) # doctest: +SKIP

Extract values at the location of the given point geometries:

>>> import geopandas as gpd
>>> from shapely.geometry import Point

Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

>>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
>>> values = dataset.extract(mask=points)
>>> print(values) # doctest: +SKIP
[[4 3 3 4]
 [3 4 4 2]]

Source code in src/pyramids/dataset/ops/analysis.py

def extract(
    self,
    band: int | None = None,
    exclude_value: Any | None = None,
    mask: FeatureCollection | GeoDataFrame | None = None,
) -> np.ndarray:
    """Extract.

    - Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
    - If the mask parameter is given, the raster will be clipped to the extent of the given mask and the
      values within the mask are extracted.

    Args:
        band (int, optional):
            Band index. Default is None.
        exclude_value (Numeric, optional):
            Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value`
            will be filtered out from the first band only.
        mask (FeatureCollection | GeoDataFrame, optional):
            Vector data containing point geometries at which to extract the values. Default is None.

    Returns:
        np.ndarray:
            The extracted values from each band in the dataset will be in one row in the returned array.

    Examples:
        - Extract all values from the dataset:

          - First, create a dataset with 2 bands, 4 rows and 4 columns:

            ```python
            >>> import numpy as np
            >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
            >>> top_left_corner = (0, 0)
            >>> cell_size = 0.05
            >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 0.05
                        Dimension: 4 * 4
                        EPSG: 4326
                        Number of Bands: 2
                        Band names: ['Band_1', 'Band_2']
                        Mask: -9999.0
                        Data type: int32
                        File:...
            <BLANKLINE>
            >>> print(dataset.read_array()) # doctest: +SKIP
            [[[1 3 3 4]
              [1 4 2 4]
              [2 4 2 1]
              [1 3 2 3]]
             [[3 2 1 3]
              [4 3 2 2]
              [2 2 3 4]
              [1 4 1 4]]]

            ```

          - Now, extract the values in the dataset:

            ```python
            >>> values = dataset.extract()
            >>> print(values) # doctest: +SKIP
            [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
             [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

            ```

          - Extract all the values except 2:

            ```python
            >>> values = dataset.extract(exclude_value=2)
            >>> print(values) # doctest: +SKIP

            ```

        - Extract values at the location of the given point geometries:

          ```python
          >>> import geopandas as gpd
          >>> from shapely.geometry import Point
          ```

          - Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

            ```python
            >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
            >>> values = dataset.extract(mask=points)
            >>> print(values) # doctest: +SKIP
            [[4 3 3 4]
             [3 4 4 2]]

            ```
    """
    # Optimize: make the read_array return only the array for inside the mask feature, and not to read the whole
    #  raster
    arr = self.read_array(band=band)
    no_data_value = (
        self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
    )
    if mask is None:
        exclude_list = (
            [no_data_value, exclude_value]
            if exclude_value is not None
            else [no_data_value]
        )
        values = get_pixels2(arr, exclude_list)
    else:
        indices = self.map_to_array_coordinates(mask)
        if arr.ndim > 2:
            values = arr[:, indices[:, 0], indices[:, 1]]
        else:
            values = arr[indices[:, 0], indices[:, 1]]

    return np.asarray(values)

`overlay(classes_map, band=0, exclude_value=None)` #

Overlay.

Overlay method extracts all the values in the dataset for each class in the given class map.

Parameters:

Name	Type	Description	Default
`classes_map`	`Dataset`	Dataset object for the raster that has classes you want to overlay with the raster.	required
`band`	`int`	If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.	`0`
`exclude_value`	`Numeric`	Values you want to exclude from extracted values. Default is None.	`None`

Returns:

Name	Type	Description
`Dict`	`dict[list[float], list[float]]`	Dictionary with class values as keys (from the class map), and for each key a list of all the intersected values in the base map.

Examples:

Read the dataset:

>>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
>>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

rhine-rainfall

Read the classes dataset:

>>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
>>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

rhine-classes

Overlay the dataset with the classes dataset:

>>> classes_dict = dataset.overlay(classes)
>>> print(classes_dict.keys()) # doctest: +SKIP
dict_keys([1, 2, 3, 4, 5])

You can use the key 1 to get the values that overlay class 1.

Source code in src/pyramids/dataset/ops/analysis.py

def overlay(
    self: Dataset,
    classes_map,
    band: int = 0,
    exclude_value: float | int | None = None,
) -> dict[list[float], list[float]]:
    """Overlay.

    Overlay method extracts all the values in the dataset for each class in the given class map.

    Args:
        classes_map (Dataset):
            Dataset object for the raster that has classes you want to overlay with the raster.
        band (int):
            If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.
        exclude_value (Numeric, optional):
            Values you want to exclude from extracted values. Default is None.

    Returns:
        Dict:
            Dictionary with class values as keys (from the class map), and for each key a list of all the intersected
            values in the base map.

    Examples:
        - Read the dataset:

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
          >>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

          ```

          ![rhine-rainfall](./../../_images/dataset/rhine-rainfall.png)

        - Read the classes dataset:

          ```python
          >>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
          >>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

          ```

          ![rhine-classes](./../../_images/dataset/rhine-classes.png)

        - Overlay the dataset with the classes dataset:

          ```python
          >>> classes_dict = dataset.overlay(classes)
          >>> print(classes_dict.keys()) # doctest: +SKIP
          dict_keys([1, 2, 3, 4, 5])

          ```

        - You can use the key `1` to get the values that overlay class 1.
    """
    if not self._check_alignment(classes_map):
        raise AlignmentError(
            "The class Dataset is not aligned with the current raster, please use the method "
            "'align' to align both rasters."
        )
    arr = self.read_array(band=band)
    no_data_value = (
        self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
    )
    mask = (
        [no_data_value, exclude_value]
        if exclude_value is not None
        else [no_data_value]
    )
    ind = get_indices2(arr, mask)
    classes = classes_map.read_array()
    values: dict[Any, list[Any]] = dict()

    # extract values
    for i, ind_i in enumerate(ind):
        # first check if the sub-basin has a list in the dict if not create a list
        key = classes[ind_i[0], ind_i[1]]
        if key not in list(values.keys()):
            values[key] = list()

        values[key].append(arr[ind_i[0], ind_i[1]])

    return values

`get_mask(band=0)` #

Get the mask array.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`

Returns:

Type	Description
`ndarray`	np.ndarray: Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.

Source code in src/pyramids/dataset/ops/analysis.py

def get_mask(self, band: int = 0) -> np.ndarray:
    """Get the mask array.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        np.ndarray:
            Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.
    """
    # TODO: there is a CreateMaskBand method in the gdal.Dataset class, it creates a mask band for the dataset
    #   either internally or externally.
    arr = np.asarray(self._iloc(band).GetMaskBand().ReadAsArray())
    return arr

`footprint(band=0, exclude_values=None)` #

Extract the real coverage of the values in a certain band.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 0.	`0`
`exclude_values`	`List[Any] \| None`	If you want to exclude a certain value in the raster with another value inter the two values as a list of tuples a [(value_to_be_exclude_valuesd, new_value)]. Example of exclude_values usage: `>>> exclude_values = [0]` This parameter is introduced particularly in the case of rasters that has the no_data_value stored in the `no_data_value` property does not match the value stored in the band, so this option can correct this behavior.	`None`

Returns:

Name	Type	Description
`GeoDataFrame`	`GeoDataFrame \| None`	geodataframe containing the polygon representing the extent of the raster. the extent column should contain a value of 2 only. if the dataset had separate polygons, each polygon will be in a separate row.

Examples:

The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

>>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
>>> dataset.plot()
(<Figure size 800x800 with 2 Axes>, <Axes: >)

dataset-footprint-rhine-flood

Now, to extract the footprint of the dataset band, we need to specify the exclude_values parameter with the value of the non-flooded cells.

>>> extent = dataset.footprint(band=0, exclude_values=[0])
>>> print(extent)
   Band_1                                           geometry
0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

>>> extent.plot()
<Axes: >

dataset-footprint-rhine-flood-extent

Source code in src/pyramids/dataset/ops/analysis.py

def footprint(
    self: Dataset,
    band: int = 0,
    exclude_values: list[Any] | None = None,
) -> GeoDataFrame | None:
    """Extract the real coverage of the values in a certain band.

    Args:
        band (int):
            Band index. Default is 0.
        exclude_values (List[Any] | None):
            If you want to exclude a certain value in the raster with another value inter the two values as a
            list of tuples a [(value_to_be_exclude_valuesd, new_value)].

            - Example of exclude_values usage:

              ```python
              >>> exclude_values = [0]

              ```

            - This parameter is introduced particularly in the case of rasters that has the no_data_value stored in
              the `no_data_value` property does not match the value stored in the band, so this option can correct
              this behavior.

    Returns:
        GeoDataFrame:
            - geodataframe containing the polygon representing the extent of the raster. the extent column should
              contain a value of 2 only.
            - if the dataset had separate polygons, each polygon will be in a separate row.

    Examples:
        - The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with
          zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
          >>> dataset.plot()
          (<Figure size 800x800 with 2 Axes>, <Axes: >)

          ```

        ![dataset-footprint-rhine-flood](./../../_images/dataset/dataset-footprint-rhine-flood.png)

        - Now, to extract the footprint of the dataset band, we need to specify the `exclude_values` parameter with the
          value of the non-flooded cells.

          ```python
          >>> extent = dataset.footprint(band=0, exclude_values=[0])
          >>> print(extent)
             Band_1                                           geometry
          0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
          1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
          2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
          3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
          4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
          5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
          6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

          >>> extent.plot()
          <Axes: >

          ```

        ![dataset-footprint-rhine-flood-extent](./../../_images/dataset/dataset-footprint-rhine-flood-extent.png)

    """
    arr = self.read_array(band=band)
    no_data_val = self.no_data_value[band]

    if no_data_val is None:
        if not (np.isnan(arr)).any():
            self.logger.warning(
                "The nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    else:
        if not (np.isclose(arr, no_data_val, rtol=0.00001)).any():
            self.logger.warning(
                "the nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    # if you want to exclude_values any value in the raster
    if exclude_values:
        for val in exclude_values:
            try:
                # in case the val2 is None, and the array is int type, the following line will give error as None
                # is considered as float
                arr[np.isclose(arr, val)] = no_data_val
            except TypeError:
                arr = arr.astype(np.float32)
                arr[np.isclose(arr, val)] = no_data_val

    # replace all the values with 2
    if no_data_val is None:
        # check if the whole raster is full of no_data_value
        if (np.isnan(arr)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isnan(arr)] = 2
    else:
        # check if the whole raster is full of no_data_value
        if (np.isclose(arr, no_data_val, rtol=0.00001)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isclose(arr, no_data_val, rtol=0.00001)] = 2
    new_dataset = self.create_from_array(
        arr, geo=self.geotransform, epsg=self.epsg, no_data_value=self.no_data_value
    )
    # then convert the raster into polygon
    gdf = new_dataset.cluster2(band=band)
    gdf.rename(columns={"Band_1": self.band_names[band]}, inplace=True)

    return gdf

`normalize(array)` `staticmethod` #

Normalize numpy arrays into scale 0.0-1.0.

Parameters:

Name	Type	Description	Default
`array`	`ndarray`	Numpy array to normalize.	required

Returns:

Type	Description
`ndarray`	np.ndarray: Normalized array.

Source code in src/pyramids/dataset/ops/analysis.py

@staticmethod
def normalize(array: np.ndarray) -> np.ndarray:
    """Normalize numpy arrays into scale 0.0-1.0.

    Args:
        array (np.ndarray): Numpy array to normalize.

    Returns:
        np.ndarray: Normalized array.
    """
    array_min = array.min()
    array_max = array.max()
    val = (array - array_min) / (array_max - array_min)
    return np.asarray(val)

`get_histogram(band=0, bins=6, min_value=None, max_value=None, include_out_of_range=False, approx_ok=False)` #

Get histogram.

Parameters:

Name	Type	Description	Default
`band`	`int`	Band index. Default is 1.	`0`
`bins`	`int`	Number of bins. Default is 6.	`6`
`min_value`	`float`	Minimum value. Default is None.	`None`
`max_value`	`float`	Maximum value. Default is None.	`None`
`include_out_of_range`	`bool`	If True, add out-of-range values into the first and last buckets. Default is False.	`False`
`approx_ok`	`bool`	If True, compute an approximate histogram by using subsampling or overviews. Default is False.	`False`

Returns:

Type	Description
`tuple[list, list[tuple[Any, Any]]]`	tuple[list, list[tuple[Any, Any]]]: Histogram values and bin edges.

Hint

The value of the histogram will be stored in an xml file by the name of the raster file with the extension of .aux.xml.

The content of the file will be like the following:

    <PAMDataset>
      <PAMRasterBand band="1">
        <Description>Band_1</Description>
        <Histograms>
          <HistItem>
            <HistMin>0</HistMin>
            <HistMax>88</HistMax>
            <BucketCount>6</BucketCount>
            <IncludeOutOfRange>0</IncludeOutOfRange>
            <Approximate>0</Approximate>
            <HistCounts>75|6|0|4|2|1</HistCounts>
          </HistItem>
        </Histograms>
      </PAMRasterBand>
    </PAMDataset>

Examples:

Create Dataset consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

```python

import numpy as np arr = np.random.randint(1, 12, size=(10, 10)) print(arr) # doctest: +SKIP [[ 4 1 1 2 6 9 2 5 1 8] [ 1 11 5 6 2 5 4 6 6 7] [ 5 2 10 4 8 11 4 11 11 1] [ 2 3 6 3 1 5 11 10 10 7] [ 8 2 11 3 1 3 5 4 10 10] [ 1 2 1 6 10 3 6 4 2 8] [ 9 5 7 9 7 8 1 11 4 4] [ 7 7 2 2 5 3 7 2 9 9] [ 2 10 3 2 1 11 5 9 8 11] [ 1 5 6 11 3 3 8 1 2 1]] top_left_corner = (0, 0) cell_size = 0.05 dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

```

Now, let's get the histogram of the first band using the get_histogram method with the default parameters:

>>> hist, ranges = dataset.get_histogram(band=0)
>>> print(hist)  # doctest: +SKIP
[28, 17, 10, 15, 13, 7]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

we can also exclude values from the histogram by using the min_value and max_value:

>>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
>>> print(hist)  # doctest: +SKIP
[10, 8, 7, 7, 6, 0]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

For datasets with big dimensions, computing the histogram can take some time; approximating the computation of the histogram can save a lot of computation time. When using the parameter approx_ok with a True value the histogram will be calculated from resampling the band or from the overviews if they exist.
```
>>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
>>> print(hist)  # doctest: +SKIP
[28, 17, 10, 15, 13, 7]
>>> print(ranges)   # doctest: +SKIP
[(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]
```
As you see for small datasets, the approximation of the histogram will be the same as without approximation.

Source code in src/pyramids/dataset/ops/analysis.py

def get_histogram(
    self: Dataset,
    band: int = 0,
    bins: int = 6,
    min_value: float | None = None,
    max_value: float | None = None,
    include_out_of_range: bool = False,
    approx_ok: bool = False,
) -> tuple[list, list[tuple[Any, Any]]]:
    """Get histogram.

    Args:
        band (int, optional):
            Band index. Default is 1.
        bins (int, optional):
            Number of bins. Default is 6.
        min_value (float, optional):
            Minimum value. Default is None.
        max_value (float, optional):
            Maximum value. Default is None.
        include_out_of_range (bool, optional):
            If True, add out-of-range values into the first and last buckets. Default is False.
        approx_ok (bool, optional):
            If True, compute an approximate histogram by using subsampling or overviews. Default is False.

    Returns:
        tuple[list, list[tuple[Any, Any]]]:
            Histogram values and bin edges.

    Hint:
        - The value of the histogram will be stored in an xml file by the name of the raster file with the extension
            of .aux.xml.

        - The content of the file will be like the following:
          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Histograms>
                    <HistItem>
                      <HistMin>0</HistMin>
                      <HistMax>88</HistMax>
                      <BucketCount>6</BucketCount>
                      <IncludeOutOfRange>0</IncludeOutOfRange>
                      <Approximate>0</Approximate>
                      <HistCounts>75|6|0|4|2|1</HistCounts>
                    </HistItem>
                  </Histograms>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Create `Dataset` consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 12, size=(10, 10))
          >>> print(arr)    # doctest: +SKIP
          [[ 4  1  1  2  6  9  2  5  1  8]
           [ 1 11  5  6  2  5  4  6  6  7]
           [ 5  2 10  4  8 11  4 11 11  1]
           [ 2  3  6  3  1  5 11 10 10  7]
           [ 8  2 11  3  1  3  5  4 10 10]
           [ 1  2  1  6 10  3  6  4  2  8]
           [ 9  5  7  9  7  8  1 11  4  4]
           [ 7  7  2  2  5  3  7  2  9  9]
           [ 2 10  3  2  1 11  5  9  8 11]
           [ 1  5  6 11  3  3  8  1  2  1]]
           >>> top_left_corner = (0, 0)
           >>> cell_size = 0.05
           >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

           ```

        - Now, let's get the histogram of the first band using the `get_histogram` method with the default
            parameters:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - we can also exclude values from the histogram by using the `min_value` and `max_value`:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
            >>> print(hist)  # doctest: +SKIP
            [10, 8, 7, 7, 6, 0]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

            ```
        - For datasets with big dimensions, computing the histogram can take some time; approximating the computation
            of the histogram can save a lot of computation time. When using the parameter `approx_ok` with a `True`
            value the histogram will be calculated from resampling the band or from the overviews if they exist.
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - As you see for small datasets, the approximation of the histogram will be the same as without approximation.

    """
    band_obj = self._iloc(band)
    min_val, max_val = band_obj.ComputeRasterMinMax()
    if min_value is None:
        min_value = min_val
    if max_value is None:
        max_value = max_val

    bin_width = (max_value - min_value) / bins
    ranges = [
        (min_val + i * bin_width, min_val + (i + 1) * bin_width)
        for i in range(bins)
    ]

    hist = band_obj.GetHistogram(
        min=min_value,
        max=max_value,
        buckets=bins,
        include_out_of_range=include_out_of_range,
        approx_ok=approx_ok,
    )
    return hist, ranges

`plot(band=None, exclude_value=None, rgb=None, surface_reflectance=None, cutoff=None, overview=False, overview_index=0, percentile=None, **kwargs)` #

Plot the values/overviews of a given band. The plot function uses the cleopatra as a backend to plot the raster data, for more information check ArrayGlyph. Args: band (int, optional): The band you want to get its data. Default is 0. exclude_value (Any, optional): Value to exclude from the plot. Default is None. rgb (List[int], optional): The indices of the red, green, and blue bands in the Dataset. the rgb parameter can be a list of three values, or a list of four values if the alpha band is also included. The plot method will check if the rgb bands are defined in the Dataset, if all the three bands ( red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands will be considered as [2,1,0] as the default order for sentinel tif files. surface_reflectance (int, optional): Surface reflectance value for normalizing satellite data, by default None. Typically 10000 for Sentinel-2 data. cutoff (List, optional): clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff and scale them back to between 0 and 1). Default is None. overview (bool, optional): True if you want to plot the overview. Default is False. overview_index (int, optional): Index of the overview. Default is 0. percentile: int The percentile value to be used for scaling. kwargs: | Parameter | Type | Description | |-----------------------------|---------------------|-------------| | points | array | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. | | point_color | str | Color of the point. | | point_size | Any | Size of the point. | | pid_color | str | Color of the annotation of the point. Default is blue. | | pid_size | Any | Size of the point annotation. | | figsize | tuple, optional | Figure size. Default is (8, 8). | | title | str, optional | Title of the plot. Default is 'Total Discharge'. | | title_size | int, optional | Title size. Default is 15. | | orientation | str, optional | Orientation of the color bar (horizontal or vertical). Default is 'vertical'. | | rotation | number, optional | Rotation of the color bar label. Default is -90. | | cbar_length | float, optional | Ratio to control the height of the color bar. Default is 0.75. | | ticks_spacing | int, optional | Spacing between color bar ticks. Default is 2. | | cbar_label_size | int, optional | Size of the color bar label. Default is 12. | | cbar_label | str, optional | Label of the color bar. Default is 'Discharge m³/s'. | | color_scale | int, optional | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is 1. | | gamma | float, optional | Value needed for color scale option 2. Default is 1/2. | | line_threshold | float, optional | Value needed for color scale option 3. Default is 0.0001. | | line_scale | float, optional | Value needed for color scale option 3. Default is 0.001. | | bounds | list, optional | Discrete bounds for color scale option 4. Default is None. | | midpoint | float, optional | Value needed for color scale option 5. Default is 0. | | cmap | str, optional | Color map style. Default is 'coolwarm_r'. | | display_cell_value | bool, optional | Whether to display cell values as text. | | num_size | int, optional | Size of numbers plotted on top of each cell. Default is 8. | | background_color_threshold| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If None, max value / 2 is used. Default is None. | Returns: ArrayGlyph: ArrayGlyph object. For more details of the ArrayGlyph object check the ArrayGlyph. Examples: - Plot a certain band:

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> dataset.plot(band=0)
(<Figure size 800x800 with 2 Axes>, <Axes: >)

- plot using power scale.

>>> dataset.plot(band=0, color_scale="power")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

- plot using SymLogNorm scale.

>>> dataset.plot(band=0, color_scale="sym-lognorm")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

- plot using PowerNorm scale.

>>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
(<Figure size 800x800 with 2 Axes>, <Axes: >)

- plot using BoundaryNorm scale.

>>> dataset.plot(band=0, color_scale="midpoint")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

Source code in src/pyramids/dataset/ops/analysis.py

def plot(
    self,
    band: int | None = None,
    exclude_value: Any | None = None,
    rgb: list[int] | None = None,
    surface_reflectance: int | None = None,
    cutoff: list | None = None,
    overview: bool | None = False,
    overview_index: int | None = 0,
    percentile: int | None = None,
    **kwargs: Any,
) -> ArrayGlyph:
    """Plot the values/overviews of a given band.
    The plot function uses the `cleopatra` as a backend to plot the raster data, for more information check
    [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/#cleopatra.array_glyph.ArrayGlyph.plot).
    Args:
        band (int, optional):
            The band you want to get its data. Default is 0.
        exclude_value (Any, optional):
            Value to exclude from the plot. Default is None.
        rgb (List[int], optional):
            The indices of the red, green, and blue bands in the `Dataset`. the `rgb` parameter can be a list of
            three values, or a list of four values if the alpha band is also included.
            The `plot` method will check if the rgb bands are defined in the `Dataset`, if all the three bands (
            red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands
            will be considered as [2,1,0] as the default order for sentinel tif files.
        surface_reflectance (int, optional):
            Surface reflectance value for normalizing satellite data, by default None.
            Typically 10000 for Sentinel-2 data.
        cutoff (List, optional):
            clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff
            and scale them back to between 0 and 1). Default is None.
        overview (bool, optional):
            True if you want to plot the overview. Default is False.
        overview_index (int, optional):
            Index of the overview. Default is 0.
        percentile: int
            The percentile value to be used for scaling.
    kwargs:
            | Parameter                   | Type                | Description |
            |-----------------------------|---------------------|-------------|
            | `points`                    | array               | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
            | `point_color`               | str                 | Color of the point. |
            | `point_size`                | Any                 | Size of the point. |
            | `pid_color`                 | str                 | Color of the annotation of the point. Default is blue. |
            | `pid_size`                  | Any                 | Size of the point annotation. |
            | `figsize`                   | tuple, optional     | Figure size. Default is `(8, 8)`. |
            | `title`                     | str, optional       | Title of the plot. Default is `'Total Discharge'`. |
            | `title_size`                | int, optional       | Title size. Default is `15`. |
            | `orientation`               | str, optional       | Orientation of the color bar (`horizontal` or `vertical`). Default is `'vertical'`. |
            | `rotation`                  | number, optional    | Rotation of the color bar label. Default is `-90`. |
            | `cbar_length`               | float, optional     | Ratio to control the height of the color bar. Default is `0.75`. |
            | `ticks_spacing`             | int, optional       | Spacing between color bar ticks. Default is `2`. |
            | `cbar_label_size`           | int, optional       | Size of the color bar label. Default is `12`. |
            | `cbar_label`                | str, optional       | Label of the color bar. Default is `'Discharge m\u00b3/s'`. |
            | `color_scale`               | int, optional       | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is `1`. |
            | `gamma`                     | float, optional     | Value needed for color scale option 2. Default is `1/2`. |
            | `line_threshold`            | float, optional     | Value needed for color scale option 3. Default is `0.0001`. |
            | `line_scale`                | float, optional     | Value needed for color scale option 3. Default is `0.001`. |
            | `bounds`                    | list, optional      | Discrete bounds for color scale option 4. Default is `None`. |
            | `midpoint`                  | float, optional     | Value needed for color scale option 5. Default is `0`. |
            | `cmap`                      | str, optional       | Color map style. Default is `'coolwarm_r'`. |
            | `display_cell_value`        | bool, optional      | Whether to display cell values as text. |
            | `num_size`                  | int, optional       | Size of numbers plotted on top of each cell. Default is `8`. |
            | `background_color_threshold`| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If `None`, max value / 2 is used. Default is `None`. |
    Returns:
        ArrayGlyph:
            ArrayGlyph object. For more details of the ArrayGlyph object check the [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/).
    Examples:
        - Plot a certain band:
          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> dataset.plot(band=0)
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using power scale.
          ```python
          >>> dataset.plot(band=0, color_scale="power")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using SymLogNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="sym-lognorm")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using PowerNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using BoundaryNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="midpoint")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
    """
    import_cleopatra(
        "The current function uses cleopatra package to for plotting, please install it manually, for more info "
        "check https://github.com/serapeum-org/cleopatra"
    )
    from cleopatra.array_glyph import ArrayGlyph
    no_data_value = [np.nan if i is None else i for i in self.no_data_value]
    if overview:
        arr = self.read_overview_array(
            band=band,
            overview_index=overview_index if overview_index is not None else 0,
        )
    else:
        arr = self.read_array(band=band)
    # if the raster has three bands or more.
    if self.band_count >= 3:
        if band is None:
            if rgb is None:
                rgb_candidate: list[int | None] = [
                    self.get_band_by_color("red"),
                    self.get_band_by_color("green"),
                    self.get_band_by_color("blue"),
                ]
                if None in rgb_candidate:
                    rgb = [2, 1, 0]
                else:
                    rgb = [int(v) for v in rgb_candidate if v is not None]
            # first make the band index the first band in the rgb list (red band)
            band = rgb[0]
    # elif self.band_count == 1:
    #     band = 0
    else:
        if band is None:
            band = 0
    exclude_value = (
        [no_data_value[band], exclude_value]
        if exclude_value is not None
        else [no_data_value[band]]
    )
    cleo = ArrayGlyph(
        arr,
        exclude_value=exclude_value,
        extent=self.bbox,
        rgb=rgb,
        surface_reflectance=surface_reflectance,
        cutoff=cutoff,
        percentile=percentile,
        **kwargs,
    )
    cleo.plot(**kwargs)
    return cleo