Skip to content

Analysis & Statistics#

Statistics, extraction, overlay, apply, fill, histogram, and plotting.

pyramids.dataset.ops.analysis.Analysis #

Mixin providing analysis, statistics, and data extraction operations for Dataset.

Source code in src/pyramids/dataset/ops/analysis.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
class Analysis:
    """Mixin providing analysis, statistics, and data extraction operations for Dataset."""

    def stats(
        self, band: int | None = None, mask: GeoDataFrame | None = None
    ) -> DataFrame:
        """Get statistics of a band [Min, max, mean, std].

        Args:
            band (int, optional):
                Band index. If None, the statistics of all bands will be returned.
            mask (Polygon GeoDataFrame or Dataset, optional):
                GeodataFrame with a geometry of polygon type.

        Returns:
            DataFrame:
                DataFrame wit the stats of each band, the dataframe has the following columns
                [min, max, mean, std], the index of the dataframe is the band names.

                ```text

                                   Min         max        mean       std
                    Band_1  270.369720  270.762299  270.551361  0.154270
                    Band_2  269.611938  269.744751  269.673645  0.043788
                    Band_3  273.641479  274.168823  273.953979  0.198447
                    Band_4  273.991516  274.540344  274.310669  0.205754
                ```

        Notes:
            - The value of the stats will be stored in an xml file by the name of the raster file with the extension of
              .aux.xml.
            - The content of the file will be like the following:

              ```xml

                  <PAMDataset>
                    <PAMRasterBand band="1">
                      <Description>Band_1</Description>
                      <Metadata>
                        <MDI key="RepresentationType">ATHEMATIC</MDI>
                        <MDI key="STATISTICS_MAXIMUM">88</MDI>
                        <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
                        <MDI key="STATISTICS_MINIMUM">0</MDI>
                        <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
                        <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
                      </Metadata>
                    </PAMRasterBand>
                  </PAMDataset>

              ```

        Examples:
            - Get the statistics of all bands in the dataset:

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 10, 10)
              >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
              >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
              >>> print(dataset.stats()) # doctest: +SKIP
                           min       max      mean       std
              Band_1  0.006443  0.942943  0.468935  0.266634
              Band_2  0.020377  0.978130  0.477189  0.306864
              Band_3  0.019652  0.992184  0.537215  0.286502
              Band_4  0.011955  0.984313  0.503616  0.295852
              >>> print(dataset.stats(band=1))  # doctest: +SKIP
                           min      max      mean       std
              Band_2  0.020377  0.97813  0.477189  0.306864

              ```

            - Get the statistics of all the bands using a mask polygon.

              - Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2,
                0.2 -0.1] to cover the 4 cells.
              ```python
              >>> from shapely.geometry import Polygon
              >>> import geopandas as gpd
              >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
              >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                           min       max      mean       std
              Band_1  0.193441  0.702108  0.541478  0.202932
              Band_2  0.281281  0.932573  0.665602  0.239410
              Band_3  0.031395  0.982235  0.493086  0.377608
              Band_4  0.079562  0.930965  0.591025  0.341578

              ```

        """
        dst: Dataset | None = None
        if mask is not None:
            dst = self.crop(mask, touch=True)

        if band is None:
            df = pd.DataFrame(
                index=self.band_names,
                columns=["min", "max", "mean", "std"],
                dtype=np.float32,
            )
            for i in range(self.band_count):
                if mask is not None and dst is not None:
                    df.iloc[i, :] = dst._get_stats(i)
                else:
                    df.iloc[i, :] = self._get_stats(i)
        else:
            df = pd.DataFrame(
                index=[self.band_names[band]],
                columns=["min", "max", "mean", "std"],
                dtype=np.float32,
            )
            if mask is not None and dst is not None:
                df.iloc[0, :] = dst._get_stats(band)
            else:
                df.iloc[0, :] = self._get_stats(band)

        return df

    def _get_stats(self, band: int | None = None) -> list[float]:
        """_get_stats."""
        band_index = band if band is not None else 0
        band_i = self._iloc(band_index)
        try:
            vals = band_i.GetStatistics(True, True)
        except RuntimeError:
            # when the GetStatistics gives an error "RuntimeError: Failed to compute statistics, no valid pixels
            # found in sampling."
            vals = [0]

        if sum(vals) == 0:
            warnings.warn(
                f"Band {band} has no statistics, and the statistics are going to be calculate"
            )
            vals = band_i.ComputeStatistics(False)

        return list(vals)

    def count_domain_cells(self, band: int = 0) -> int:
        """Count cells inside the domain.

        Args:
            band (int):
                Band index. Default is 0.

        Returns:
            int:
                Number of cells.
        """
        arr = self.read_array(band=band)
        domain_count = np.size(arr[:, :]) - np.count_nonzero(
            arr[np.isclose(arr, self.no_data_value[band], rtol=0.001)]
        )
        return int(domain_count)

    def apply(self, func, band: int = 0, inplace: bool = False) -> Dataset:
        """Apply a function to all domain cells.

        - apply method executes a mathematical operation on the raster array.
        - The function is applied to all domain cells at once using vectorized NumPy operations.

        Args:
            func (function):
                Defined function that takes one input (the cell value).
            band (int):
                Band number.
            inplace (bool):
                If True, the original dataset will be modified. If False, a new dataset will be created.
                Default is False.

        Returns:
            Dataset:
                A new Dataset with the function applied. If inplace is True, returns self.

        Examples:
            - Create a dataset from an array filled with values between -1 and 1:

              ```python
              >>> import numpy as np
              >>> arr = np.random.uniform(-1, 1, size=(5, 5))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset.read_array()) # doctest: +SKIP
              [[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
               [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
               [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
               [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
               [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

              ```

            - Apply the absolute function to the dataset:

              ```python
              >>> abs_dataset = dataset.apply(np.abs)
              >>> print(abs_dataset.read_array()) # doctest: +SKIP
              [[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
               [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
               [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
               [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
               [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

              ```
        """
        if not callable(func):
            raise TypeError("The second argument should be a function")

        no_data_value = self.no_data_value[band]
        src_array = self.read_array(band)
        dtype = self.gdal_dtype[band]

        new_array = np.full((self.rows, self.columns), no_data_value, dtype=src_array.dtype)
        domain_mask = ~np.isclose(src_array, no_data_value, rtol=0.001)
        domain_values = src_array[domain_mask]
        try:
            new_array[domain_mask] = func(domain_values)
        except (ValueError, TypeError):
            new_array[domain_mask] = np.vectorize(func)(domain_values)

        dst_obj = type(self)._build_dataset(
            self.columns, self.rows, 1, dtype, self.geotransform, self.crs, no_data_value
        )
        dst_obj.raster.GetRasterBand(1).WriteArray(new_array)

        if inplace:
            self._update_inplace(dst_obj.raster)
            return self
        return dst_obj

    def fill(
        self, value: float | int, inplace: bool = False, path: str | Path | None = None
    ) -> Dataset:
        """Fill the domain cells with a certain value.

            Fill takes a raster and fills it with one value

        Args:
            value (float | int):
                Numeric value to fill.
            inplace (bool):
                If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.
            path (str):
                Path including the extension (.tif).

        Returns:
            Dataset:
                A new Dataset with cells filled. If inplace is True, returns self.

        Examples:
            - Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 5, size=(5, 5))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset.read_array()) # doctest: +SKIP
              [[1 1 3 1 2]
               [2 2 2 1 2]
               [2 2 3 1 3]
               [3 4 3 3 4]
               [4 4 2 1 1]]
              >>> new_dataset = dataset.fill(10)
              >>> print(new_dataset.read_array())
              [[10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]
               [10 10 10 10 10]]

              ```
        """
        no_data_value = self.no_data_value[0]
        src_array = self.raster.ReadAsArray()

        if no_data_value is None:
            no_data_value = np.nan

        if not np.isnan(no_data_value):
            src_array[~np.isclose(src_array, no_data_value, rtol=0.000001)] = value
        else:
            src_array[~np.isnan(src_array)] = value

        dst = type(self).dataset_like(self, src_array, path=path)
        if inplace:
            self._update_inplace(dst.raster)
            return self
        return dst

    def extract(
        self,
        band: int | None = None,
        exclude_value: Any | None = None,
        mask: FeatureCollection | GeoDataFrame | None = None,
    ) -> np.ndarray:
        """Extract.

        - Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
        - If the mask parameter is given, the raster will be clipped to the extent of the given mask and the
          values within the mask are extracted.

        Args:
            band (int, optional):
                Band index. Default is None.
            exclude_value (Numeric, optional):
                Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value`
                will be filtered out from the first band only.
            mask (FeatureCollection | GeoDataFrame, optional):
                Vector data containing point geometries at which to extract the values. Default is None.

        Returns:
            np.ndarray:
                The extracted values from each band in the dataset will be in one row in the returned array.

        Examples:
            - Extract all values from the dataset:

              - First, create a dataset with 2 bands, 4 rows and 4 columns:

                ```python
                >>> import numpy as np
                >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
                >>> top_left_corner = (0, 0)
                >>> cell_size = 0.05
                >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
                >>> print(dataset)
                <BLANKLINE>
                            Cell size: 0.05
                            Dimension: 4 * 4
                            EPSG: 4326
                            Number of Bands: 2
                            Band names: ['Band_1', 'Band_2']
                            Mask: -9999.0
                            Data type: int32
                            File:...
                <BLANKLINE>
                >>> print(dataset.read_array()) # doctest: +SKIP
                [[[1 3 3 4]
                  [1 4 2 4]
                  [2 4 2 1]
                  [1 3 2 3]]
                 [[3 2 1 3]
                  [4 3 2 2]
                  [2 2 3 4]
                  [1 4 1 4]]]

                ```

              - Now, extract the values in the dataset:

                ```python
                >>> values = dataset.extract()
                >>> print(values) # doctest: +SKIP
                [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
                 [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

                ```

              - Extract all the values except 2:

                ```python
                >>> values = dataset.extract(exclude_value=2)
                >>> print(values) # doctest: +SKIP

                ```

            - Extract values at the location of the given point geometries:

              ```python
              >>> import geopandas as gpd
              >>> from shapely.geometry import Point
              ```

              - Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

                ```python
                >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
                >>> values = dataset.extract(mask=points)
                >>> print(values) # doctest: +SKIP
                [[4 3 3 4]
                 [3 4 4 2]]

                ```
        """
        # Optimize: make the read_array return only the array for inside the mask feature, and not to read the whole
        #  raster
        arr = self.read_array(band=band)
        no_data_value = (
            self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
        )
        if mask is None:
            exclude_list = (
                [no_data_value, exclude_value]
                if exclude_value is not None
                else [no_data_value]
            )
            values = get_pixels2(arr, exclude_list)
        else:
            indices = self.map_to_array_coordinates(mask)
            if arr.ndim > 2:
                values = arr[:, indices[:, 0], indices[:, 1]]
            else:
                values = arr[indices[:, 0], indices[:, 1]]

        return np.asarray(values)

    def overlay(
        self: Dataset,
        classes_map,
        band: int = 0,
        exclude_value: float | int | None = None,
    ) -> dict[list[float], list[float]]:
        """Overlay.

        Overlay method extracts all the values in the dataset for each class in the given class map.

        Args:
            classes_map (Dataset):
                Dataset object for the raster that has classes you want to overlay with the raster.
            band (int):
                If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.
            exclude_value (Numeric, optional):
                Values you want to exclude from extracted values. Default is None.

        Returns:
            Dict:
                Dictionary with class values as keys (from the class map), and for each key a list of all the intersected
                values in the base map.

        Examples:
            - Read the dataset:

              ```python
              >>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
              >>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

              ```

              ![rhine-rainfall](./../../_images/dataset/rhine-rainfall.png)

            - Read the classes dataset:

              ```python
              >>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
              >>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

              ```

              ![rhine-classes](./../../_images/dataset/rhine-classes.png)

            - Overlay the dataset with the classes dataset:

              ```python
              >>> classes_dict = dataset.overlay(classes)
              >>> print(classes_dict.keys()) # doctest: +SKIP
              dict_keys([1, 2, 3, 4, 5])

              ```

            - You can use the key `1` to get the values that overlay class 1.
        """
        if not self._check_alignment(classes_map):
            raise AlignmentError(
                "The class Dataset is not aligned with the current raster, please use the method "
                "'align' to align both rasters."
            )
        arr = self.read_array(band=band)
        no_data_value = (
            self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
        )
        mask = (
            [no_data_value, exclude_value]
            if exclude_value is not None
            else [no_data_value]
        )
        ind = get_indices2(arr, mask)
        classes = classes_map.read_array()
        values: dict[Any, list[Any]] = dict()

        # extract values
        for i, ind_i in enumerate(ind):
            # first check if the sub-basin has a list in the dict if not create a list
            key = classes[ind_i[0], ind_i[1]]
            if key not in list(values.keys()):
                values[key] = list()

            values[key].append(arr[ind_i[0], ind_i[1]])

        return values

    def get_mask(self, band: int = 0) -> np.ndarray:
        """Get the mask array.

        Args:
            band (int):
                Band index. Default is 0.

        Returns:
            np.ndarray:
                Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.
        """
        # TODO: there is a CreateMaskBand method in the gdal.Dataset class, it creates a mask band for the dataset
        #   either internally or externally.
        arr = np.asarray(self._iloc(band).GetMaskBand().ReadAsArray())
        return arr

    def footprint(
        self: Dataset,
        band: int = 0,
        exclude_values: list[Any] | None = None,
    ) -> GeoDataFrame | None:
        """Extract the real coverage of the values in a certain band.

        Args:
            band (int):
                Band index. Default is 0.
            exclude_values (List[Any] | None):
                If you want to exclude a certain value in the raster with another value inter the two values as a
                list of tuples a [(value_to_be_exclude_valuesd, new_value)].

                - Example of exclude_values usage:

                  ```python
                  >>> exclude_values = [0]

                  ```

                - This parameter is introduced particularly in the case of rasters that has the no_data_value stored in
                  the `no_data_value` property does not match the value stored in the band, so this option can correct
                  this behavior.

        Returns:
            GeoDataFrame:
                - geodataframe containing the polygon representing the extent of the raster. the extent column should
                  contain a value of 2 only.
                - if the dataset had separate polygons, each polygon will be in a separate row.

        Examples:
            - The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with
              zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

              ```python
              >>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
              >>> dataset.plot()
              (<Figure size 800x800 with 2 Axes>, <Axes: >)

              ```

            ![dataset-footprint-rhine-flood](./../../_images/dataset/dataset-footprint-rhine-flood.png)

            - Now, to extract the footprint of the dataset band, we need to specify the `exclude_values` parameter with the
              value of the non-flooded cells.

              ```python
              >>> extent = dataset.footprint(band=0, exclude_values=[0])
              >>> print(extent)
                 Band_1                                           geometry
              0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
              1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
              2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
              3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
              4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
              5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
              6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

              >>> extent.plot()
              <Axes: >

              ```

            ![dataset-footprint-rhine-flood-extent](./../../_images/dataset/dataset-footprint-rhine-flood-extent.png)

        """
        arr = self.read_array(band=band)
        no_data_val = self.no_data_value[band]

        if no_data_val is None:
            if not (np.isnan(arr)).any():
                self.logger.warning(
                    "The nodata value stored in the raster does not exist in the raster "
                    "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                    " not correct"
                )
        else:
            if not (np.isclose(arr, no_data_val, rtol=0.00001)).any():
                self.logger.warning(
                    "the nodata value stored in the raster does not exist in the raster "
                    "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                    " not correct"
                )
        # if you want to exclude_values any value in the raster
        if exclude_values:
            for val in exclude_values:
                try:
                    # in case the val2 is None, and the array is int type, the following line will give error as None
                    # is considered as float
                    arr[np.isclose(arr, val)] = no_data_val
                except TypeError:
                    arr = arr.astype(np.float32)
                    arr[np.isclose(arr, val)] = no_data_val

        # replace all the values with 2
        if no_data_val is None:
            # check if the whole raster is full of no_data_value
            if (np.isnan(arr)).all():
                self.logger.warning("the raster is full of no_data_value")
                return None

            arr[~np.isnan(arr)] = 2
        else:
            # check if the whole raster is full of no_data_value
            if (np.isclose(arr, no_data_val, rtol=0.00001)).all():
                self.logger.warning("the raster is full of no_data_value")
                return None

            arr[~np.isclose(arr, no_data_val, rtol=0.00001)] = 2
        new_dataset = self.create_from_array(
            arr, geo=self.geotransform, epsg=self.epsg, no_data_value=self.no_data_value
        )
        # then convert the raster into polygon
        gdf = new_dataset.cluster2(band=band)
        gdf.rename(columns={"Band_1": self.band_names[band]}, inplace=True)

        return gdf

    @staticmethod
    def normalize(array: np.ndarray) -> np.ndarray:
        """Normalize numpy arrays into scale 0.0-1.0.

        Args:
            array (np.ndarray): Numpy array to normalize.

        Returns:
            np.ndarray: Normalized array.
        """
        array_min = array.min()
        array_max = array.max()
        val = (array - array_min) / (array_max - array_min)
        return np.asarray(val)

    @staticmethod
    def _rescale(array: np.ndarray, min_value: float, max_value: float) -> np.ndarray:
        val = (array - min_value) / (max_value - min_value)
        return val

    def get_histogram(
        self: Dataset,
        band: int = 0,
        bins: int = 6,
        min_value: float | None = None,
        max_value: float | None = None,
        include_out_of_range: bool = False,
        approx_ok: bool = False,
    ) -> tuple[list, list[tuple[Any, Any]]]:
        """Get histogram.

        Args:
            band (int, optional):
                Band index. Default is 1.
            bins (int, optional):
                Number of bins. Default is 6.
            min_value (float, optional):
                Minimum value. Default is None.
            max_value (float, optional):
                Maximum value. Default is None.
            include_out_of_range (bool, optional):
                If True, add out-of-range values into the first and last buckets. Default is False.
            approx_ok (bool, optional):
                If True, compute an approximate histogram by using subsampling or overviews. Default is False.

        Returns:
            tuple[list, list[tuple[Any, Any]]]:
                Histogram values and bin edges.

        Hint:
            - The value of the histogram will be stored in an xml file by the name of the raster file with the extension
                of .aux.xml.

            - The content of the file will be like the following:
              ```xml

                  <PAMDataset>
                    <PAMRasterBand band="1">
                      <Description>Band_1</Description>
                      <Histograms>
                        <HistItem>
                          <HistMin>0</HistMin>
                          <HistMax>88</HistMax>
                          <BucketCount>6</BucketCount>
                          <IncludeOutOfRange>0</IncludeOutOfRange>
                          <Approximate>0</Approximate>
                          <HistCounts>75|6|0|4|2|1</HistCounts>
                        </HistItem>
                      </Histograms>
                    </PAMRasterBand>
                  </PAMDataset>

              ```

        Examples:
            - Create `Dataset` consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 12, size=(10, 10))
              >>> print(arr)    # doctest: +SKIP
              [[ 4  1  1  2  6  9  2  5  1  8]
               [ 1 11  5  6  2  5  4  6  6  7]
               [ 5  2 10  4  8 11  4 11 11  1]
               [ 2  3  6  3  1  5 11 10 10  7]
               [ 8  2 11  3  1  3  5  4 10 10]
               [ 1  2  1  6 10  3  6  4  2  8]
               [ 9  5  7  9  7  8  1 11  4  4]
               [ 7  7  2  2  5  3  7  2  9  9]
               [ 2 10  3  2  1 11  5  9  8 11]
               [ 1  5  6 11  3  3  8  1  2  1]]
               >>> top_left_corner = (0, 0)
               >>> cell_size = 0.05
               >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

               ```

            - Now, let's get the histogram of the first band using the `get_histogram` method with the default
                parameters:
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0)
                >>> print(hist)  # doctest: +SKIP
                [28, 17, 10, 15, 13, 7]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

                ```
            - we can also exclude values from the histogram by using the `min_value` and `max_value`:
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
                >>> print(hist)  # doctest: +SKIP
                [10, 8, 7, 7, 6, 0]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

                ```
            - For datasets with big dimensions, computing the histogram can take some time; approximating the computation
                of the histogram can save a lot of computation time. When using the parameter `approx_ok` with a `True`
                value the histogram will be calculated from resampling the band or from the overviews if they exist.
                ```python
                >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
                >>> print(hist)  # doctest: +SKIP
                [28, 17, 10, 15, 13, 7]
                >>> print(ranges)   # doctest: +SKIP
                [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

                ```
            - As you see for small datasets, the approximation of the histogram will be the same as without approximation.

        """
        band_obj = self._iloc(band)
        min_val, max_val = band_obj.ComputeRasterMinMax()
        if min_value is None:
            min_value = min_val
        if max_value is None:
            max_value = max_val

        bin_width = (max_value - min_value) / bins
        ranges = [
            (min_val + i * bin_width, min_val + (i + 1) * bin_width)
            for i in range(bins)
        ]

        hist = band_obj.GetHistogram(
            min=min_value,
            max=max_value,
            buckets=bins,
            include_out_of_range=include_out_of_range,
            approx_ok=approx_ok,
        )
        return hist, ranges
    def plot(
        self,
        band: int | None = None,
        exclude_value: Any | None = None,
        rgb: list[int] | None = None,
        surface_reflectance: int | None = None,
        cutoff: list | None = None,
        overview: bool | None = False,
        overview_index: int | None = 0,
        percentile: int | None = None,
        **kwargs: Any,
    ) -> ArrayGlyph:
        """Plot the values/overviews of a given band.
        The plot function uses the `cleopatra` as a backend to plot the raster data, for more information check
        [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/#cleopatra.array_glyph.ArrayGlyph.plot).
        Args:
            band (int, optional):
                The band you want to get its data. Default is 0.
            exclude_value (Any, optional):
                Value to exclude from the plot. Default is None.
            rgb (List[int], optional):
                The indices of the red, green, and blue bands in the `Dataset`. the `rgb` parameter can be a list of
                three values, or a list of four values if the alpha band is also included.
                The `plot` method will check if the rgb bands are defined in the `Dataset`, if all the three bands (
                red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands
                will be considered as [2,1,0] as the default order for sentinel tif files.
            surface_reflectance (int, optional):
                Surface reflectance value for normalizing satellite data, by default None.
                Typically 10000 for Sentinel-2 data.
            cutoff (List, optional):
                clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff
                and scale them back to between 0 and 1). Default is None.
            overview (bool, optional):
                True if you want to plot the overview. Default is False.
            overview_index (int, optional):
                Index of the overview. Default is 0.
            percentile: int
                The percentile value to be used for scaling.
        kwargs:
                | Parameter                   | Type                | Description |
                |-----------------------------|---------------------|-------------|
                | `points`                    | array               | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
                | `point_color`               | str                 | Color of the point. |
                | `point_size`                | Any                 | Size of the point. |
                | `pid_color`                 | str                 | Color of the annotation of the point. Default is blue. |
                | `pid_size`                  | Any                 | Size of the point annotation. |
                | `figsize`                   | tuple, optional     | Figure size. Default is `(8, 8)`. |
                | `title`                     | str, optional       | Title of the plot. Default is `'Total Discharge'`. |
                | `title_size`                | int, optional       | Title size. Default is `15`. |
                | `orientation`               | str, optional       | Orientation of the color bar (`horizontal` or `vertical`). Default is `'vertical'`. |
                | `rotation`                  | number, optional    | Rotation of the color bar label. Default is `-90`. |
                | `cbar_length`               | float, optional     | Ratio to control the height of the color bar. Default is `0.75`. |
                | `ticks_spacing`             | int, optional       | Spacing between color bar ticks. Default is `2`. |
                | `cbar_label_size`           | int, optional       | Size of the color bar label. Default is `12`. |
                | `cbar_label`                | str, optional       | Label of the color bar. Default is `'Discharge m\u00b3/s'`. |
                | `color_scale`               | int, optional       | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is `1`. |
                | `gamma`                     | float, optional     | Value needed for color scale option 2. Default is `1/2`. |
                | `line_threshold`            | float, optional     | Value needed for color scale option 3. Default is `0.0001`. |
                | `line_scale`                | float, optional     | Value needed for color scale option 3. Default is `0.001`. |
                | `bounds`                    | list, optional      | Discrete bounds for color scale option 4. Default is `None`. |
                | `midpoint`                  | float, optional     | Value needed for color scale option 5. Default is `0`. |
                | `cmap`                      | str, optional       | Color map style. Default is `'coolwarm_r'`. |
                | `display_cell_value`        | bool, optional      | Whether to display cell values as text. |
                | `num_size`                  | int, optional       | Size of numbers plotted on top of each cell. Default is `8`. |
                | `background_color_threshold`| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If `None`, max value / 2 is used. Default is `None`. |
        Returns:
            ArrayGlyph:
                ArrayGlyph object. For more details of the ArrayGlyph object check the [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/).
        Examples:
            - Plot a certain band:
              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 10, 10)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
              >>> dataset.plot(band=0)
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using power scale.
              ```python
              >>> dataset.plot(band=0, color_scale="power")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using SymLogNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="sym-lognorm")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using PowerNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
            - plot using BoundaryNorm scale.
              ```python
              >>> dataset.plot(band=0, color_scale="midpoint")
              (<Figure size 800x800 with 2 Axes>, <Axes: >)
              ```
        """
        import_cleopatra(
            "The current function uses cleopatra package to for plotting, please install it manually, for more info "
            "check https://github.com/serapeum-org/cleopatra"
        )
        from cleopatra.array_glyph import ArrayGlyph
        no_data_value = [np.nan if i is None else i for i in self.no_data_value]
        if overview:
            arr = self.read_overview_array(
                band=band,
                overview_index=overview_index if overview_index is not None else 0,
            )
        else:
            arr = self.read_array(band=band)
        # if the raster has three bands or more.
        if self.band_count >= 3:
            if band is None:
                if rgb is None:
                    rgb_candidate: list[int | None] = [
                        self.get_band_by_color("red"),
                        self.get_band_by_color("green"),
                        self.get_band_by_color("blue"),
                    ]
                    if None in rgb_candidate:
                        rgb = [2, 1, 0]
                    else:
                        rgb = [int(v) for v in rgb_candidate if v is not None]
                # first make the band index the first band in the rgb list (red band)
                band = rgb[0]
        # elif self.band_count == 1:
        #     band = 0
        else:
            if band is None:
                band = 0
        exclude_value = (
            [no_data_value[band], exclude_value]
            if exclude_value is not None
            else [no_data_value[band]]
        )
        cleo = ArrayGlyph(
            arr,
            exclude_value=exclude_value,
            extent=self.bbox,
            rgb=rgb,
            surface_reflectance=surface_reflectance,
            cutoff=cutoff,
            percentile=percentile,
            **kwargs,
        )
        cleo.plot(**kwargs)
        return cleo
    @staticmethod
    def _process_color_table(color_table: DataFrame) -> DataFrame:
        import_cleopatra(
            "The current function uses cleopatra package to for plotting, please install it manually, for more info"
            " check https://github.com/serapeum-org/cleopatra"
        )
        from cleopatra.colors import Colors
        # if the color_table does not contain the red, green, and blue columns, assume it has one column with
        # the color as hex and then, convert the color to rgb.
        if all(elem in color_table.columns for elem in ["red", "green", "blue"]):
            color_df = color_table.loc[:, ["values", "red", "green", "blue"]]
        elif "color" in color_table.columns:
            color = Colors(color_table["color"].tolist())
            color_rgb = color.to_rgb(normalized=False)
            color_df = DataFrame(columns=["values"])
            color_df["values"] = color_table["values"].to_list()
            color_df.loc[:, ["red", "green", "blue"]] = color_rgb
        else:
            raise ValueError(
                f"color_table must contain either red, green, blue, or color columns. given columns are: "
                f"{color_table.columns}"
            )
        if "alpha" not in color_table.columns:
            color_df.loc[:, "alpha"] = 255
        else:
            color_df.loc[:, "alpha"] = color_table["alpha"]
        return color_df

stats(band=None, mask=None) #

Get statistics of a band [Min, max, mean, std].

Parameters:

Name Type Description Default
band int

Band index. If None, the statistics of all bands will be returned.

None
mask Polygon GeoDataFrame or Dataset

GeodataFrame with a geometry of polygon type.

None

Returns:

Name Type Description
DataFrame DataFrame

DataFrame wit the stats of each band, the dataframe has the following columns [min, max, mean, std], the index of the dataframe is the band names.

                   Min         max        mean       std
    Band_1  270.369720  270.762299  270.551361  0.154270
    Band_2  269.611938  269.744751  269.673645  0.043788
    Band_3  273.641479  274.168823  273.953979  0.198447
    Band_4  273.991516  274.540344  274.310669  0.205754
Notes
  • The value of the stats will be stored in an xml file by the name of the raster file with the extension of .aux.xml.
  • The content of the file will be like the following:
    <PAMDataset>
      <PAMRasterBand band="1">
        <Description>Band_1</Description>
        <Metadata>
          <MDI key="RepresentationType">ATHEMATIC</MDI>
          <MDI key="STATISTICS_MAXIMUM">88</MDI>
          <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
          <MDI key="STATISTICS_MINIMUM">0</MDI>
          <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
          <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
        </Metadata>
      </PAMRasterBand>
    </PAMDataset>

Examples:

  • Get the statistics of all bands in the dataset:
>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
>>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
>>> print(dataset.stats()) # doctest: +SKIP
             min       max      mean       std
Band_1  0.006443  0.942943  0.468935  0.266634
Band_2  0.020377  0.978130  0.477189  0.306864
Band_3  0.019652  0.992184  0.537215  0.286502
Band_4  0.011955  0.984313  0.503616  0.295852
>>> print(dataset.stats(band=1))  # doctest: +SKIP
             min      max      mean       std
Band_2  0.020377  0.97813  0.477189  0.306864
  • Get the statistics of all the bands using a mask polygon.

  • Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2 -0.1] to cover the 4 cells.

    >>> from shapely.geometry import Polygon
    >>> import geopandas as gpd
    >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
    >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                 min       max      mean       std
    Band_1  0.193441  0.702108  0.541478  0.202932
    Band_2  0.281281  0.932573  0.665602  0.239410
    Band_3  0.031395  0.982235  0.493086  0.377608
    Band_4  0.079562  0.930965  0.591025  0.341578
    

Source code in src/pyramids/dataset/ops/analysis.py
def stats(
    self, band: int | None = None, mask: GeoDataFrame | None = None
) -> DataFrame:
    """Get statistics of a band [Min, max, mean, std].

    Args:
        band (int, optional):
            Band index. If None, the statistics of all bands will be returned.
        mask (Polygon GeoDataFrame or Dataset, optional):
            GeodataFrame with a geometry of polygon type.

    Returns:
        DataFrame:
            DataFrame wit the stats of each band, the dataframe has the following columns
            [min, max, mean, std], the index of the dataframe is the band names.

            ```text

                               Min         max        mean       std
                Band_1  270.369720  270.762299  270.551361  0.154270
                Band_2  269.611938  269.744751  269.673645  0.043788
                Band_3  273.641479  274.168823  273.953979  0.198447
                Band_4  273.991516  274.540344  274.310669  0.205754
            ```

    Notes:
        - The value of the stats will be stored in an xml file by the name of the raster file with the extension of
          .aux.xml.
        - The content of the file will be like the following:

          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Metadata>
                    <MDI key="RepresentationType">ATHEMATIC</MDI>
                    <MDI key="STATISTICS_MAXIMUM">88</MDI>
                    <MDI key="STATISTICS_MEAN">7.9662921348315</MDI>
                    <MDI key="STATISTICS_MINIMUM">0</MDI>
                    <MDI key="STATISTICS_STDDEV">18.294377743948</MDI>
                    <MDI key="STATISTICS_VALID_PERCENT">48.9</MDI>
                  </Metadata>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Get the statistics of all bands in the dataset:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> geotransform = (0, 0.05, 0, 0, 0, -0.05)
          >>> dataset = Dataset.create_from_array(arr, geo=geotransform, epsg=4326)
          >>> print(dataset.stats()) # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.006443  0.942943  0.468935  0.266634
          Band_2  0.020377  0.978130  0.477189  0.306864
          Band_3  0.019652  0.992184  0.537215  0.286502
          Band_4  0.011955  0.984313  0.503616  0.295852
          >>> print(dataset.stats(band=1))  # doctest: +SKIP
                       min      max      mean       std
          Band_2  0.020377  0.97813  0.477189  0.306864

          ```

        - Get the statistics of all the bands using a mask polygon.

          - Create the polygon using shapely polygon, and use the xmin, ymin, xmax, ymax = [0.1, -0.2,
            0.2 -0.1] to cover the 4 cells.
          ```python
          >>> from shapely.geometry import Polygon
          >>> import geopandas as gpd
          >>> mask = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])],crs=4326)
          >>> print(dataset.stats(mask=mask))  # doctest: +SKIP
                       min       max      mean       std
          Band_1  0.193441  0.702108  0.541478  0.202932
          Band_2  0.281281  0.932573  0.665602  0.239410
          Band_3  0.031395  0.982235  0.493086  0.377608
          Band_4  0.079562  0.930965  0.591025  0.341578

          ```

    """
    dst: Dataset | None = None
    if mask is not None:
        dst = self.crop(mask, touch=True)

    if band is None:
        df = pd.DataFrame(
            index=self.band_names,
            columns=["min", "max", "mean", "std"],
            dtype=np.float32,
        )
        for i in range(self.band_count):
            if mask is not None and dst is not None:
                df.iloc[i, :] = dst._get_stats(i)
            else:
                df.iloc[i, :] = self._get_stats(i)
    else:
        df = pd.DataFrame(
            index=[self.band_names[band]],
            columns=["min", "max", "mean", "std"],
            dtype=np.float32,
        )
        if mask is not None and dst is not None:
            df.iloc[0, :] = dst._get_stats(band)
        else:
            df.iloc[0, :] = self._get_stats(band)

    return df

count_domain_cells(band=0) #

Count cells inside the domain.

Parameters:

Name Type Description Default
band int

Band index. Default is 0.

0

Returns:

Name Type Description
int int

Number of cells.

Source code in src/pyramids/dataset/ops/analysis.py
def count_domain_cells(self, band: int = 0) -> int:
    """Count cells inside the domain.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        int:
            Number of cells.
    """
    arr = self.read_array(band=band)
    domain_count = np.size(arr[:, :]) - np.count_nonzero(
        arr[np.isclose(arr, self.no_data_value[band], rtol=0.001)]
    )
    return int(domain_count)

apply(func, band=0, inplace=False) #

Apply a function to all domain cells.

  • apply method executes a mathematical operation on the raster array.
  • The function is applied to all domain cells at once using vectorized NumPy operations.

Parameters:

Name Type Description Default
func function

Defined function that takes one input (the cell value).

required
band int

Band number.

0
inplace bool

If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.

False

Returns:

Name Type Description
Dataset Dataset

A new Dataset with the function applied. If inplace is True, returns self.

Examples:

  • Create a dataset from an array filled with values between -1 and 1:
>>> import numpy as np
>>> arr = np.random.uniform(-1, 1, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
 [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
 [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
 [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
 [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]
  • Apply the absolute function to the dataset:
>>> abs_dataset = dataset.apply(np.abs)
>>> print(abs_dataset.read_array()) # doctest: +SKIP
[[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
 [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
 [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
 [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
 [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]
Source code in src/pyramids/dataset/ops/analysis.py
def apply(self, func, band: int = 0, inplace: bool = False) -> Dataset:
    """Apply a function to all domain cells.

    - apply method executes a mathematical operation on the raster array.
    - The function is applied to all domain cells at once using vectorized NumPy operations.

    Args:
        func (function):
            Defined function that takes one input (the cell value).
        band (int):
            Band number.
        inplace (bool):
            If True, the original dataset will be modified. If False, a new dataset will be created.
            Default is False.

    Returns:
        Dataset:
            A new Dataset with the function applied. If inplace is True, returns self.

    Examples:
        - Create a dataset from an array filled with values between -1 and 1:

          ```python
          >>> import numpy as np
          >>> arr = np.random.uniform(-1, 1, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[ 0.94997539 -0.80083622 -0.30948769 -0.77439961 -0.83836424]
           [-0.36810158 -0.23979251  0.88051216 -0.46882913  0.64511056]
           [ 0.50585374 -0.46905902  0.67856589  0.2779605   0.05589759]
           [ 0.63382852 -0.49259597  0.18471423 -0.49308984 -0.52840286]
           [-0.34076174 -0.53073014 -0.18485789 -0.40033474 -0.38962938]]

          ```

        - Apply the absolute function to the dataset:

          ```python
          >>> abs_dataset = dataset.apply(np.abs)
          >>> print(abs_dataset.read_array()) # doctest: +SKIP
          [[0.94997539 0.80083622 0.30948769 0.77439961 0.83836424]
           [0.36810158 0.23979251 0.88051216 0.46882913 0.64511056]
           [0.50585374 0.46905902 0.67856589 0.2779605  0.05589759]
           [0.63382852 0.49259597 0.18471423 0.49308984 0.52840286]
           [0.34076174 0.53073014 0.18485789 0.40033474 0.38962938]]

          ```
    """
    if not callable(func):
        raise TypeError("The second argument should be a function")

    no_data_value = self.no_data_value[band]
    src_array = self.read_array(band)
    dtype = self.gdal_dtype[band]

    new_array = np.full((self.rows, self.columns), no_data_value, dtype=src_array.dtype)
    domain_mask = ~np.isclose(src_array, no_data_value, rtol=0.001)
    domain_values = src_array[domain_mask]
    try:
        new_array[domain_mask] = func(domain_values)
    except (ValueError, TypeError):
        new_array[domain_mask] = np.vectorize(func)(domain_values)

    dst_obj = type(self)._build_dataset(
        self.columns, self.rows, 1, dtype, self.geotransform, self.crs, no_data_value
    )
    dst_obj.raster.GetRasterBand(1).WriteArray(new_array)

    if inplace:
        self._update_inplace(dst_obj.raster)
        return self
    return dst_obj

fill(value, inplace=False, path=None) #

Fill the domain cells with a certain value.

Fill takes a raster and fills it with one value

Parameters:

Name Type Description Default
value float | int

Numeric value to fill.

required
inplace bool

If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.

False
path str

Path including the extension (.tif).

None

Returns:

Name Type Description
Dataset Dataset

A new Dataset with cells filled. If inplace is True, returns self.

Examples:

  • Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):
>>> import numpy as np
>>> arr = np.random.randint(1, 5, size=(5, 5))
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.read_array()) # doctest: +SKIP
[[1 1 3 1 2]
 [2 2 2 1 2]
 [2 2 3 1 3]
 [3 4 3 3 4]
 [4 4 2 1 1]]
>>> new_dataset = dataset.fill(10)
>>> print(new_dataset.read_array())
[[10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]
 [10 10 10 10 10]]
Source code in src/pyramids/dataset/ops/analysis.py
def fill(
    self, value: float | int, inplace: bool = False, path: str | Path | None = None
) -> Dataset:
    """Fill the domain cells with a certain value.

        Fill takes a raster and fills it with one value

    Args:
        value (float | int):
            Numeric value to fill.
        inplace (bool):
            If True, the original dataset will be modified. If False, a new dataset will be created. Default is False.
        path (str):
            Path including the extension (.tif).

    Returns:
        Dataset:
            A new Dataset with cells filled. If inplace is True, returns self.

    Examples:
        - Create a Dataset with 1 band, 5 rows, 5 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 5, size=(5, 5))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.read_array()) # doctest: +SKIP
          [[1 1 3 1 2]
           [2 2 2 1 2]
           [2 2 3 1 3]
           [3 4 3 3 4]
           [4 4 2 1 1]]
          >>> new_dataset = dataset.fill(10)
          >>> print(new_dataset.read_array())
          [[10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]
           [10 10 10 10 10]]

          ```
    """
    no_data_value = self.no_data_value[0]
    src_array = self.raster.ReadAsArray()

    if no_data_value is None:
        no_data_value = np.nan

    if not np.isnan(no_data_value):
        src_array[~np.isclose(src_array, no_data_value, rtol=0.000001)] = value
    else:
        src_array[~np.isnan(src_array)] = value

    dst = type(self).dataset_like(self, src_array, path=path)
    if inplace:
        self._update_inplace(dst.raster)
        return self
    return dst

extract(band=None, exclude_value=None, mask=None) #

Extract.

  • Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
  • If the mask parameter is given, the raster will be clipped to the extent of the given mask and the values within the mask are extracted.

Parameters:

Name Type Description Default
band int

Band index. Default is None.

None
exclude_value Numeric

Values to exclude from extracted values. If the dataset is multi-band, the values in exclude_value will be filtered out from the first band only.

None
mask FeatureCollection | GeoDataFrame

Vector data containing point geometries at which to extract the values. Default is None.

None

Returns:

Type Description
ndarray

np.ndarray: The extracted values from each band in the dataset will be in one row in the returned array.

Examples:

  • Extract all values from the dataset:

  • First, create a dataset with 2 bands, 4 rows and 4 columns:

    >>> import numpy as np
    >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
    >>> top_left_corner = (0, 0)
    >>> cell_size = 0.05
    >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
    >>> print(dataset)
    <BLANKLINE>
                Cell size: 0.05
                Dimension: 4 * 4
                EPSG: 4326
                Number of Bands: 2
                Band names: ['Band_1', 'Band_2']
                Mask: -9999.0
                Data type: int32
                File:...
    <BLANKLINE>
    >>> print(dataset.read_array()) # doctest: +SKIP
    [[[1 3 3 4]
      [1 4 2 4]
      [2 4 2 1]
      [1 3 2 3]]
     [[3 2 1 3]
      [4 3 2 2]
      [2 2 3 4]
      [1 4 1 4]]]
    
  • Now, extract the values in the dataset:

    >>> values = dataset.extract()
    >>> print(values) # doctest: +SKIP
    [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
     [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]
    
  • Extract all the values except 2:

    >>> values = dataset.extract(exclude_value=2)
    >>> print(values) # doctest: +SKIP
    
  • Extract values at the location of the given point geometries:

>>> import geopandas as gpd
>>> from shapely.geometry import Point
  • Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

    >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
    >>> values = dataset.extract(mask=points)
    >>> print(values) # doctest: +SKIP
    [[4 3 3 4]
     [3 4 4 2]]
    
Source code in src/pyramids/dataset/ops/analysis.py
def extract(
    self,
    band: int | None = None,
    exclude_value: Any | None = None,
    mask: FeatureCollection | GeoDataFrame | None = None,
) -> np.ndarray:
    """Extract.

    - Extract method gets all the values in a raster, and excludes the values in the exclude_value parameter.
    - If the mask parameter is given, the raster will be clipped to the extent of the given mask and the
      values within the mask are extracted.

    Args:
        band (int, optional):
            Band index. Default is None.
        exclude_value (Numeric, optional):
            Values to exclude from extracted values. If the dataset is multi-band, the values in `exclude_value`
            will be filtered out from the first band only.
        mask (FeatureCollection | GeoDataFrame, optional):
            Vector data containing point geometries at which to extract the values. Default is None.

    Returns:
        np.ndarray:
            The extracted values from each band in the dataset will be in one row in the returned array.

    Examples:
        - Extract all values from the dataset:

          - First, create a dataset with 2 bands, 4 rows and 4 columns:

            ```python
            >>> import numpy as np
            >>> arr = np.random.randint(1, 5, size=(2, 4, 4))
            >>> top_left_corner = (0, 0)
            >>> cell_size = 0.05
            >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
            >>> print(dataset)
            <BLANKLINE>
                        Cell size: 0.05
                        Dimension: 4 * 4
                        EPSG: 4326
                        Number of Bands: 2
                        Band names: ['Band_1', 'Band_2']
                        Mask: -9999.0
                        Data type: int32
                        File:...
            <BLANKLINE>
            >>> print(dataset.read_array()) # doctest: +SKIP
            [[[1 3 3 4]
              [1 4 2 4]
              [2 4 2 1]
              [1 3 2 3]]
             [[3 2 1 3]
              [4 3 2 2]
              [2 2 3 4]
              [1 4 1 4]]]

            ```

          - Now, extract the values in the dataset:

            ```python
            >>> values = dataset.extract()
            >>> print(values) # doctest: +SKIP
            [[1 3 3 4 1 4 2 4 2 4 2 1 1 3 2 3]
             [3 2 1 3 4 3 2 2 2 2 3 4 1 4 1 4]]

            ```

          - Extract all the values except 2:

            ```python
            >>> values = dataset.extract(exclude_value=2)
            >>> print(values) # doctest: +SKIP

            ```

        - Extract values at the location of the given point geometries:

          ```python
          >>> import geopandas as gpd
          >>> from shapely.geometry import Point
          ```

          - Create the points using shapely and GeoPandas to cover the 4 cells with xmin, ymin, xmax, ymax = [0.1, -0.2, 0.2, -0.1]:

            ```python
            >>> points = gpd.GeoDataFrame(geometry=[Point(0.1, -0.1), Point(0.1, -0.2), Point(0.2, -0.2), Point(0.2, -0.1)],crs=4326)
            >>> values = dataset.extract(mask=points)
            >>> print(values) # doctest: +SKIP
            [[4 3 3 4]
             [3 4 4 2]]

            ```
    """
    # Optimize: make the read_array return only the array for inside the mask feature, and not to read the whole
    #  raster
    arr = self.read_array(band=band)
    no_data_value = (
        self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
    )
    if mask is None:
        exclude_list = (
            [no_data_value, exclude_value]
            if exclude_value is not None
            else [no_data_value]
        )
        values = get_pixels2(arr, exclude_list)
    else:
        indices = self.map_to_array_coordinates(mask)
        if arr.ndim > 2:
            values = arr[:, indices[:, 0], indices[:, 1]]
        else:
            values = arr[indices[:, 0], indices[:, 1]]

    return np.asarray(values)

overlay(classes_map, band=0, exclude_value=None) #

Overlay.

Overlay method extracts all the values in the dataset for each class in the given class map.

Parameters:

Name Type Description Default
classes_map Dataset

Dataset object for the raster that has classes you want to overlay with the raster.

required
band int

If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.

0
exclude_value Numeric

Values you want to exclude from extracted values. Default is None.

None

Returns:

Name Type Description
Dict dict[list[float], list[float]]

Dictionary with class values as keys (from the class map), and for each key a list of all the intersected values in the base map.

Examples:

  • Read the dataset:
>>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
>>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

rhine-rainfall

  • Read the classes dataset:
>>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
>>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

rhine-classes

  • Overlay the dataset with the classes dataset:
>>> classes_dict = dataset.overlay(classes)
>>> print(classes_dict.keys()) # doctest: +SKIP
dict_keys([1, 2, 3, 4, 5])
  • You can use the key 1 to get the values that overlay class 1.
Source code in src/pyramids/dataset/ops/analysis.py
def overlay(
    self: Dataset,
    classes_map,
    band: int = 0,
    exclude_value: float | int | None = None,
) -> dict[list[float], list[float]]:
    """Overlay.

    Overlay method extracts all the values in the dataset for each class in the given class map.

    Args:
        classes_map (Dataset):
            Dataset object for the raster that has classes you want to overlay with the raster.
        band (int):
            If the raster is multi-band, choose the band you want to overlay with the classes map. Default is 0.
        exclude_value (Numeric, optional):
            Values you want to exclude from extracted values. Default is None.

    Returns:
        Dict:
            Dictionary with class values as keys (from the class map), and for each key a list of all the intersected
            values in the base map.

    Examples:
        - Read the dataset:

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/raster-folder/MSWEP_1979.01.01.tif")
          >>> dataset.plot(figsize=(6, 8)) # doctest: +SKIP

          ```

          ![rhine-rainfall](./../../_images/dataset/rhine-rainfall.png)

        - Read the classes dataset:

          ```python
          >>> classes = Dataset.read_file("examples/data/geotiff/rhine-classes.tif")
          >>> classes.plot(figsize=(6, 8), color_scale=4, bounds=[1,2,3,4,5,6]) # doctest: +SKIP

          ```

          ![rhine-classes](./../../_images/dataset/rhine-classes.png)

        - Overlay the dataset with the classes dataset:

          ```python
          >>> classes_dict = dataset.overlay(classes)
          >>> print(classes_dict.keys()) # doctest: +SKIP
          dict_keys([1, 2, 3, 4, 5])

          ```

        - You can use the key `1` to get the values that overlay class 1.
    """
    if not self._check_alignment(classes_map):
        raise AlignmentError(
            "The class Dataset is not aligned with the current raster, please use the method "
            "'align' to align both rasters."
        )
    arr = self.read_array(band=band)
    no_data_value = (
        self.no_data_value[0] if self.no_data_value[0] is not None else np.nan
    )
    mask = (
        [no_data_value, exclude_value]
        if exclude_value is not None
        else [no_data_value]
    )
    ind = get_indices2(arr, mask)
    classes = classes_map.read_array()
    values: dict[Any, list[Any]] = dict()

    # extract values
    for i, ind_i in enumerate(ind):
        # first check if the sub-basin has a list in the dict if not create a list
        key = classes[ind_i[0], ind_i[1]]
        if key not in list(values.keys()):
            values[key] = list()

        values[key].append(arr[ind_i[0], ind_i[1]])

    return values

get_mask(band=0) #

Get the mask array.

Parameters:

Name Type Description Default
band int

Band index. Default is 0.

0

Returns:

Type Description
ndarray

np.ndarray: Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.

Source code in src/pyramids/dataset/ops/analysis.py
def get_mask(self, band: int = 0) -> np.ndarray:
    """Get the mask array.

    Args:
        band (int):
            Band index. Default is 0.

    Returns:
        np.ndarray:
            Array of the mask. 0 value for cells out of the domain, and 255 for cells in the domain.
    """
    # TODO: there is a CreateMaskBand method in the gdal.Dataset class, it creates a mask band for the dataset
    #   either internally or externally.
    arr = np.asarray(self._iloc(band).GetMaskBand().ReadAsArray())
    return arr

footprint(band=0, exclude_values=None) #

Extract the real coverage of the values in a certain band.

Parameters:

Name Type Description Default
band int

Band index. Default is 0.

0
exclude_values List[Any] | None

If you want to exclude a certain value in the raster with another value inter the two values as a list of tuples a [(value_to_be_exclude_valuesd, new_value)].

  • Example of exclude_values usage:
>>> exclude_values = [0]
  • This parameter is introduced particularly in the case of rasters that has the no_data_value stored in the no_data_value property does not match the value stored in the band, so this option can correct this behavior.
None

Returns:

Name Type Description
GeoDataFrame GeoDataFrame | None
  • geodataframe containing the polygon representing the extent of the raster. the extent column should contain a value of 2 only.
  • if the dataset had separate polygons, each polygon will be in a separate row.

Examples:

  • The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with zero, so to extract the flood extent, we need to exclude the zero flood depth cells.
>>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
>>> dataset.plot()
(<Figure size 800x800 with 2 Axes>, <Axes: >)

dataset-footprint-rhine-flood

  • Now, to extract the footprint of the dataset band, we need to specify the exclude_values parameter with the value of the non-flooded cells.
>>> extent = dataset.footprint(band=0, exclude_values=[0])
>>> print(extent)
   Band_1                                           geometry
0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

>>> extent.plot()
<Axes: >

dataset-footprint-rhine-flood-extent

Source code in src/pyramids/dataset/ops/analysis.py
def footprint(
    self: Dataset,
    band: int = 0,
    exclude_values: list[Any] | None = None,
) -> GeoDataFrame | None:
    """Extract the real coverage of the values in a certain band.

    Args:
        band (int):
            Band index. Default is 0.
        exclude_values (List[Any] | None):
            If you want to exclude a certain value in the raster with another value inter the two values as a
            list of tuples a [(value_to_be_exclude_valuesd, new_value)].

            - Example of exclude_values usage:

              ```python
              >>> exclude_values = [0]

              ```

            - This parameter is introduced particularly in the case of rasters that has the no_data_value stored in
              the `no_data_value` property does not match the value stored in the band, so this option can correct
              this behavior.

    Returns:
        GeoDataFrame:
            - geodataframe containing the polygon representing the extent of the raster. the extent column should
              contain a value of 2 only.
            - if the dataset had separate polygons, each polygon will be in a separate row.

    Examples:
        - The following raster dataset has flood depth stored in its values, and the non-flooded cells are filled with
          zero, so to extract the flood extent, we need to exclude the zero flood depth cells.

          ```python
          >>> dataset = Dataset.read_file("examples/data/geotiff/rhine-flood.tif")
          >>> dataset.plot()
          (<Figure size 800x800 with 2 Axes>, <Axes: >)

          ```

        ![dataset-footprint-rhine-flood](./../../_images/dataset/dataset-footprint-rhine-flood.png)

        - Now, to extract the footprint of the dataset band, we need to specify the `exclude_values` parameter with the
          value of the non-flooded cells.

          ```python
          >>> extent = dataset.footprint(band=0, exclude_values=[0])
          >>> print(extent)
             Band_1                                           geometry
          0     2.0  POLYGON ((4070974.182 3181069.473, 4070974.182...
          1     2.0  POLYGON ((4077674.182 3181169.473, 4077674.182...
          2     2.0  POLYGON ((4091174.182 3169169.473, 4091174.182...
          3     2.0  POLYGON ((4088574.182 3176269.473, 4088574.182...
          4     2.0  POLYGON ((4082974.182 3167869.473, 4082974.182...
          5     2.0  POLYGON ((4092274.182 3168269.473, 4092274.182...
          6     2.0  POLYGON ((4072474.182 3181169.473, 4072474.182...

          >>> extent.plot()
          <Axes: >

          ```

        ![dataset-footprint-rhine-flood-extent](./../../_images/dataset/dataset-footprint-rhine-flood-extent.png)

    """
    arr = self.read_array(band=band)
    no_data_val = self.no_data_value[band]

    if no_data_val is None:
        if not (np.isnan(arr)).any():
            self.logger.warning(
                "The nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    else:
        if not (np.isclose(arr, no_data_val, rtol=0.00001)).any():
            self.logger.warning(
                "the nodata value stored in the raster does not exist in the raster "
                "so either the raster extent is all full of data, or the no_data_value stored in the raster is"
                " not correct"
            )
    # if you want to exclude_values any value in the raster
    if exclude_values:
        for val in exclude_values:
            try:
                # in case the val2 is None, and the array is int type, the following line will give error as None
                # is considered as float
                arr[np.isclose(arr, val)] = no_data_val
            except TypeError:
                arr = arr.astype(np.float32)
                arr[np.isclose(arr, val)] = no_data_val

    # replace all the values with 2
    if no_data_val is None:
        # check if the whole raster is full of no_data_value
        if (np.isnan(arr)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isnan(arr)] = 2
    else:
        # check if the whole raster is full of no_data_value
        if (np.isclose(arr, no_data_val, rtol=0.00001)).all():
            self.logger.warning("the raster is full of no_data_value")
            return None

        arr[~np.isclose(arr, no_data_val, rtol=0.00001)] = 2
    new_dataset = self.create_from_array(
        arr, geo=self.geotransform, epsg=self.epsg, no_data_value=self.no_data_value
    )
    # then convert the raster into polygon
    gdf = new_dataset.cluster2(band=band)
    gdf.rename(columns={"Band_1": self.band_names[band]}, inplace=True)

    return gdf

normalize(array) staticmethod #

Normalize numpy arrays into scale 0.0-1.0.

Parameters:

Name Type Description Default
array ndarray

Numpy array to normalize.

required

Returns:

Type Description
ndarray

np.ndarray: Normalized array.

Source code in src/pyramids/dataset/ops/analysis.py
@staticmethod
def normalize(array: np.ndarray) -> np.ndarray:
    """Normalize numpy arrays into scale 0.0-1.0.

    Args:
        array (np.ndarray): Numpy array to normalize.

    Returns:
        np.ndarray: Normalized array.
    """
    array_min = array.min()
    array_max = array.max()
    val = (array - array_min) / (array_max - array_min)
    return np.asarray(val)

get_histogram(band=0, bins=6, min_value=None, max_value=None, include_out_of_range=False, approx_ok=False) #

Get histogram.

Parameters:

Name Type Description Default
band int

Band index. Default is 1.

0
bins int

Number of bins. Default is 6.

6
min_value float

Minimum value. Default is None.

None
max_value float

Maximum value. Default is None.

None
include_out_of_range bool

If True, add out-of-range values into the first and last buckets. Default is False.

False
approx_ok bool

If True, compute an approximate histogram by using subsampling or overviews. Default is False.

False

Returns:

Type Description
tuple[list, list[tuple[Any, Any]]]

tuple[list, list[tuple[Any, Any]]]: Histogram values and bin edges.

Hint
  • The value of the histogram will be stored in an xml file by the name of the raster file with the extension of .aux.xml.

  • The content of the file will be like the following:

        <PAMDataset>
          <PAMRasterBand band="1">
            <Description>Band_1</Description>
            <Histograms>
              <HistItem>
                <HistMin>0</HistMin>
                <HistMax>88</HistMax>
                <BucketCount>6</BucketCount>
                <IncludeOutOfRange>0</IncludeOutOfRange>
                <Approximate>0</Approximate>
                <HistCounts>75|6|0|4|2|1</HistCounts>
              </HistItem>
            </Histograms>
          </PAMRasterBand>
        </PAMDataset>
    

Examples:

  • Create Dataset consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

```python

import numpy as np arr = np.random.randint(1, 12, size=(10, 10)) print(arr) # doctest: +SKIP [[ 4 1 1 2 6 9 2 5 1 8] [ 1 11 5 6 2 5 4 6 6 7] [ 5 2 10 4 8 11 4 11 11 1] [ 2 3 6 3 1 5 11 10 10 7] [ 8 2 11 3 1 3 5 4 10 10] [ 1 2 1 6 10 3 6 4 2 8] [ 9 5 7 9 7 8 1 11 4 4] [ 7 7 2 2 5 3 7 2 9 9] [ 2 10 3 2 1 11 5 9 8 11] [ 1 5 6 11 3 3 8 1 2 1]] top_left_corner = (0, 0) cell_size = 0.05 dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

```

  • Now, let's get the histogram of the first band using the get_histogram method with the default parameters:
    >>> hist, ranges = dataset.get_histogram(band=0)
    >>> print(hist)  # doctest: +SKIP
    [28, 17, 10, 15, 13, 7]
    >>> print(ranges)   # doctest: +SKIP
    [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]
    
  • we can also exclude values from the histogram by using the min_value and max_value:
    >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
    >>> print(hist)  # doctest: +SKIP
    [10, 8, 7, 7, 6, 0]
    >>> print(ranges)   # doctest: +SKIP
    [(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]
    
  • For datasets with big dimensions, computing the histogram can take some time; approximating the computation of the histogram can save a lot of computation time. When using the parameter approx_ok with a True value the histogram will be calculated from resampling the band or from the overviews if they exist.
    >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
    >>> print(hist)  # doctest: +SKIP
    [28, 17, 10, 15, 13, 7]
    >>> print(ranges)   # doctest: +SKIP
    [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]
    
  • As you see for small datasets, the approximation of the histogram will be the same as without approximation.
Source code in src/pyramids/dataset/ops/analysis.py
def get_histogram(
    self: Dataset,
    band: int = 0,
    bins: int = 6,
    min_value: float | None = None,
    max_value: float | None = None,
    include_out_of_range: bool = False,
    approx_ok: bool = False,
) -> tuple[list, list[tuple[Any, Any]]]:
    """Get histogram.

    Args:
        band (int, optional):
            Band index. Default is 1.
        bins (int, optional):
            Number of bins. Default is 6.
        min_value (float, optional):
            Minimum value. Default is None.
        max_value (float, optional):
            Maximum value. Default is None.
        include_out_of_range (bool, optional):
            If True, add out-of-range values into the first and last buckets. Default is False.
        approx_ok (bool, optional):
            If True, compute an approximate histogram by using subsampling or overviews. Default is False.

    Returns:
        tuple[list, list[tuple[Any, Any]]]:
            Histogram values and bin edges.

    Hint:
        - The value of the histogram will be stored in an xml file by the name of the raster file with the extension
            of .aux.xml.

        - The content of the file will be like the following:
          ```xml

              <PAMDataset>
                <PAMRasterBand band="1">
                  <Description>Band_1</Description>
                  <Histograms>
                    <HistItem>
                      <HistMin>0</HistMin>
                      <HistMax>88</HistMax>
                      <BucketCount>6</BucketCount>
                      <IncludeOutOfRange>0</IncludeOutOfRange>
                      <Approximate>0</Approximate>
                      <HistCounts>75|6|0|4|2|1</HistCounts>
                    </HistItem>
                  </Histograms>
                </PAMRasterBand>
              </PAMDataset>

          ```

    Examples:
        - Create `Dataset` consists of 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0).

          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 12, size=(10, 10))
          >>> print(arr)    # doctest: +SKIP
          [[ 4  1  1  2  6  9  2  5  1  8]
           [ 1 11  5  6  2  5  4  6  6  7]
           [ 5  2 10  4  8 11  4 11 11  1]
           [ 2  3  6  3  1  5 11 10 10  7]
           [ 8  2 11  3  1  3  5  4 10 10]
           [ 1  2  1  6 10  3  6  4  2  8]
           [ 9  5  7  9  7  8  1 11  4  4]
           [ 7  7  2  2  5  3  7  2  9  9]
           [ 2 10  3  2  1 11  5  9  8 11]
           [ 1  5  6 11  3  3  8  1  2  1]]
           >>> top_left_corner = (0, 0)
           >>> cell_size = 0.05
           >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

           ```

        - Now, let's get the histogram of the first band using the `get_histogram` method with the default
            parameters:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - we can also exclude values from the histogram by using the `min_value` and `max_value`:
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, min_value=5, max_value=10)
            >>> print(hist)  # doctest: +SKIP
            [10, 8, 7, 7, 6, 0]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 1.835), (1.835, 2.67), (2.67, 3.5), (3.5, 4.34), (4.34, 5.167), (5.167, 6.0)]

            ```
        - For datasets with big dimensions, computing the histogram can take some time; approximating the computation
            of the histogram can save a lot of computation time. When using the parameter `approx_ok` with a `True`
            value the histogram will be calculated from resampling the band or from the overviews if they exist.
            ```python
            >>> hist, ranges = dataset.get_histogram(band=0, approx_ok=True)
            >>> print(hist)  # doctest: +SKIP
            [28, 17, 10, 15, 13, 7]
            >>> print(ranges)   # doctest: +SKIP
            [(1.0, 2.67), (2.67, 4.34), (4.34, 6.0), (6.0, 7.67), (7.67, 9.34), (9.34, 11.0)]

            ```
        - As you see for small datasets, the approximation of the histogram will be the same as without approximation.

    """
    band_obj = self._iloc(band)
    min_val, max_val = band_obj.ComputeRasterMinMax()
    if min_value is None:
        min_value = min_val
    if max_value is None:
        max_value = max_val

    bin_width = (max_value - min_value) / bins
    ranges = [
        (min_val + i * bin_width, min_val + (i + 1) * bin_width)
        for i in range(bins)
    ]

    hist = band_obj.GetHistogram(
        min=min_value,
        max=max_value,
        buckets=bins,
        include_out_of_range=include_out_of_range,
        approx_ok=approx_ok,
    )
    return hist, ranges

plot(band=None, exclude_value=None, rgb=None, surface_reflectance=None, cutoff=None, overview=False, overview_index=0, percentile=None, **kwargs) #

Plot the values/overviews of a given band. The plot function uses the cleopatra as a backend to plot the raster data, for more information check ArrayGlyph. Args: band (int, optional): The band you want to get its data. Default is 0. exclude_value (Any, optional): Value to exclude from the plot. Default is None. rgb (List[int], optional): The indices of the red, green, and blue bands in the Dataset. the rgb parameter can be a list of three values, or a list of four values if the alpha band is also included. The plot method will check if the rgb bands are defined in the Dataset, if all the three bands ( red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands will be considered as [2,1,0] as the default order for sentinel tif files. surface_reflectance (int, optional): Surface reflectance value for normalizing satellite data, by default None. Typically 10000 for Sentinel-2 data. cutoff (List, optional): clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff and scale them back to between 0 and 1). Default is None. overview (bool, optional): True if you want to plot the overview. Default is False. overview_index (int, optional): Index of the overview. Default is 0. percentile: int The percentile value to be used for scaling. kwargs: | Parameter | Type | Description | |-----------------------------|---------------------|-------------| | points | array | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. | | point_color | str | Color of the point. | | point_size | Any | Size of the point. | | pid_color | str | Color of the annotation of the point. Default is blue. | | pid_size | Any | Size of the point annotation. | | figsize | tuple, optional | Figure size. Default is (8, 8). | | title | str, optional | Title of the plot. Default is 'Total Discharge'. | | title_size | int, optional | Title size. Default is 15. | | orientation | str, optional | Orientation of the color bar (horizontal or vertical). Default is 'vertical'. | | rotation | number, optional | Rotation of the color bar label. Default is -90. | | cbar_length | float, optional | Ratio to control the height of the color bar. Default is 0.75. | | ticks_spacing | int, optional | Spacing between color bar ticks. Default is 2. | | cbar_label_size | int, optional | Size of the color bar label. Default is 12. | | cbar_label | str, optional | Label of the color bar. Default is 'Discharge m³/s'. | | color_scale | int, optional | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is 1. | | gamma | float, optional | Value needed for color scale option 2. Default is 1/2. | | line_threshold | float, optional | Value needed for color scale option 3. Default is 0.0001. | | line_scale | float, optional | Value needed for color scale option 3. Default is 0.001. | | bounds | list, optional | Discrete bounds for color scale option 4. Default is None. | | midpoint | float, optional | Value needed for color scale option 5. Default is 0. | | cmap | str, optional | Color map style. Default is 'coolwarm_r'. | | display_cell_value | bool, optional | Whether to display cell values as text. | | num_size | int, optional | Size of numbers plotted on top of each cell. Default is 8. | | background_color_threshold| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If None, max value / 2 is used. Default is None. | Returns: ArrayGlyph: ArrayGlyph object. For more details of the ArrayGlyph object check the ArrayGlyph. Examples: - Plot a certain band:

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
>>> dataset.plot(band=0)
(<Figure size 800x800 with 2 Axes>, <Axes: >)
- plot using power scale.
>>> dataset.plot(band=0, color_scale="power")
(<Figure size 800x800 with 2 Axes>, <Axes: >)
- plot using SymLogNorm scale.
>>> dataset.plot(band=0, color_scale="sym-lognorm")
(<Figure size 800x800 with 2 Axes>, <Axes: >)
- plot using PowerNorm scale.
>>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
(<Figure size 800x800 with 2 Axes>, <Axes: >)
- plot using BoundaryNorm scale.
>>> dataset.plot(band=0, color_scale="midpoint")
(<Figure size 800x800 with 2 Axes>, <Axes: >)

Source code in src/pyramids/dataset/ops/analysis.py
def plot(
    self,
    band: int | None = None,
    exclude_value: Any | None = None,
    rgb: list[int] | None = None,
    surface_reflectance: int | None = None,
    cutoff: list | None = None,
    overview: bool | None = False,
    overview_index: int | None = 0,
    percentile: int | None = None,
    **kwargs: Any,
) -> ArrayGlyph:
    """Plot the values/overviews of a given band.
    The plot function uses the `cleopatra` as a backend to plot the raster data, for more information check
    [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/#cleopatra.array_glyph.ArrayGlyph.plot).
    Args:
        band (int, optional):
            The band you want to get its data. Default is 0.
        exclude_value (Any, optional):
            Value to exclude from the plot. Default is None.
        rgb (List[int], optional):
            The indices of the red, green, and blue bands in the `Dataset`. the `rgb` parameter can be a list of
            three values, or a list of four values if the alpha band is also included.
            The `plot` method will check if the rgb bands are defined in the `Dataset`, if all the three bands (
            red, green, blue)) are defined, the method will use them to plot the real image, if not the rgb bands
            will be considered as [2,1,0] as the default order for sentinel tif files.
        surface_reflectance (int, optional):
            Surface reflectance value for normalizing satellite data, by default None.
            Typically 10000 for Sentinel-2 data.
        cutoff (List, optional):
            clip the range of pixel values for each band. (take only the pixel values from 0 to the value of the cutoff
            and scale them back to between 0 and 1). Default is None.
        overview (bool, optional):
            True if you want to plot the overview. Default is False.
        overview_index (int, optional):
            Index of the overview. Default is 0.
        percentile: int
            The percentile value to be used for scaling.
    kwargs:
            | Parameter                   | Type                | Description |
            |-----------------------------|---------------------|-------------|
            | `points`                    | array               | 3 column array with the first column as the value to display for the point, the second as the row index, and the third as the column index in the array. The second and third columns tell the location of the point. |
            | `point_color`               | str                 | Color of the point. |
            | `point_size`                | Any                 | Size of the point. |
            | `pid_color`                 | str                 | Color of the annotation of the point. Default is blue. |
            | `pid_size`                  | Any                 | Size of the point annotation. |
            | `figsize`                   | tuple, optional     | Figure size. Default is `(8, 8)`. |
            | `title`                     | str, optional       | Title of the plot. Default is `'Total Discharge'`. |
            | `title_size`                | int, optional       | Title size. Default is `15`. |
            | `orientation`               | str, optional       | Orientation of the color bar (`horizontal` or `vertical`). Default is `'vertical'`. |
            | `rotation`                  | number, optional    | Rotation of the color bar label. Default is `-90`. |
            | `cbar_length`               | float, optional     | Ratio to control the height of the color bar. Default is `0.75`. |
            | `ticks_spacing`             | int, optional       | Spacing between color bar ticks. Default is `2`. |
            | `cbar_label_size`           | int, optional       | Size of the color bar label. Default is `12`. |
            | `cbar_label`                | str, optional       | Label of the color bar. Default is `'Discharge m\u00b3/s'`. |
            | `color_scale`               | int, optional       | Scale mode for colors. Options: 1 = normal, 2 = power, 3 = SymLogNorm, 4 = PowerNorm, 5 = BoundaryNorm. Default is `1`. |
            | `gamma`                     | float, optional     | Value needed for color scale option 2. Default is `1/2`. |
            | `line_threshold`            | float, optional     | Value needed for color scale option 3. Default is `0.0001`. |
            | `line_scale`                | float, optional     | Value needed for color scale option 3. Default is `0.001`. |
            | `bounds`                    | list, optional      | Discrete bounds for color scale option 4. Default is `None`. |
            | `midpoint`                  | float, optional     | Value needed for color scale option 5. Default is `0`. |
            | `cmap`                      | str, optional       | Color map style. Default is `'coolwarm_r'`. |
            | `display_cell_value`        | bool, optional      | Whether to display cell values as text. |
            | `num_size`                  | int, optional       | Size of numbers plotted on top of each cell. Default is `8`. |
            | `background_color_threshold`| float or int, optional | Threshold for deciding text color over cells: if value > threshold -> black text; else white text. If `None`, max value / 2 is used. Default is `None`. |
    Returns:
        ArrayGlyph:
            ArrayGlyph object. For more details of the ArrayGlyph object check the [ArrayGlyph](https://serapeum-org.github.io/cleopatra/latest/api/array-glyph-class/).
    Examples:
        - Plot a certain band:
          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 10, 10)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
          >>> dataset.plot(band=0)
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using power scale.
          ```python
          >>> dataset.plot(band=0, color_scale="power")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using SymLogNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="sym-lognorm")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using PowerNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="boundary-norm", bounds=[0, 0.2, 0.4, 0.6, 0.8, 1])
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
        - plot using BoundaryNorm scale.
          ```python
          >>> dataset.plot(band=0, color_scale="midpoint")
          (<Figure size 800x800 with 2 Axes>, <Axes: >)
          ```
    """
    import_cleopatra(
        "The current function uses cleopatra package to for plotting, please install it manually, for more info "
        "check https://github.com/serapeum-org/cleopatra"
    )
    from cleopatra.array_glyph import ArrayGlyph
    no_data_value = [np.nan if i is None else i for i in self.no_data_value]
    if overview:
        arr = self.read_overview_array(
            band=band,
            overview_index=overview_index if overview_index is not None else 0,
        )
    else:
        arr = self.read_array(band=band)
    # if the raster has three bands or more.
    if self.band_count >= 3:
        if band is None:
            if rgb is None:
                rgb_candidate: list[int | None] = [
                    self.get_band_by_color("red"),
                    self.get_band_by_color("green"),
                    self.get_band_by_color("blue"),
                ]
                if None in rgb_candidate:
                    rgb = [2, 1, 0]
                else:
                    rgb = [int(v) for v in rgb_candidate if v is not None]
            # first make the band index the first band in the rgb list (red band)
            band = rgb[0]
    # elif self.band_count == 1:
    #     band = 0
    else:
        if band is None:
            band = 0
    exclude_value = (
        [no_data_value[band], exclude_value]
        if exclude_value is not None
        else [no_data_value[band]]
    )
    cleo = ArrayGlyph(
        arr,
        exclude_value=exclude_value,
        extent=self.bbox,
        rgb=rgb,
        surface_reflectance=surface_reflectance,
        cutoff=cutoff,
        percentile=percentile,
        **kwargs,
    )
    cleo.plot(**kwargs)
    return cleo