Skip to content

I/O Operations

Array reading/writing, file serialization, tiling, and overview operations.

pyramids.dataset.ops.io.IO

Source code in src/pyramids/dataset/ops/io.py
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
class IO:

    def read_array(
        self: Dataset, band: int | None = None, window: GeoDataFrame | list[int] | None = None
    ) -> np.ndarray:
        """Read the values stored in a given band.

        Data Chunks/blocks
            When a raster dataset is stored on disk, it might not be stored as one continuous chunk of data. Instead,
            it can be divided into smaller rectangular blocks or tiles. These blocks can be individually accessed,
            which is particularly useful for large datasets:

                - Efficiency: Reading or writing small blocks requires less memory than dealing with the entire dataset
                      at once. This is especially beneficial when only a small portion of the data needs to be processed.
                - Performance: For certain file formats and operations, working with optimal block sizes can significantly
                      improve performance. For example, if the block size matches the reading or processing window,
                      Pyramids can minimize disk access and data transfer.

        Args:
            band (int, optional):
                The band you want to get its data. If None, data of all bands will be read. Default is None.
            window (List[int] | GeoDataFrame, optional):
                Specify a block of data to read from the dataset. The window can be specified in two ways:

                - List:
                    Window specified as a list of 4 integers [offset_x, offset_y, window_columns, window_rows].

                    - offset_x/column index: x offset of the block.
                    - offset_y/row index: y offset of the block.
                    - window_columns: number of columns in the block.
                    - window_rows: number of rows in the block.

                - GeoDataFrame:
                    GeoDataFrame with a geometry column filled with polygon geometries; the function will get the
                    total_bounds of the GeoDataFrame and use it as a window to read the raster.

        Returns:
            np.ndarray:
                array with all the values in the raster.

        Examples:
            - Create `Dataset` consisting of 4 bands, 5 rows, and 5 columns at the point lon/lat (0, 0):

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 5, 5)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

              ```

            - Read all the values stored in a given band:

              ```python
              >>> arr = dataset.read_array(band=0) # doctest: +SKIP
              array([[0.50482225, 0.45678043, 0.53294294, 0.28862223, 0.66753579],
                     [0.38471912, 0.14617829, 0.05045189, 0.00761358, 0.25501918],
                     [0.32689036, 0.37358843, 0.32233918, 0.75450564, 0.45197608],
                     [0.22944676, 0.2780928 , 0.71605189, 0.71859309, 0.61896933],
                     [0.47740168, 0.76490779, 0.07679277, 0.16142599, 0.73630836]])

              ```

            - Read a 2x2 block from the first band. The block starts at the 2nd column (index 1) and 2nd row (index 1)
                (the first index is the column index):

              ```python
              >>> arr = dataset.read_array(band=0, window=[1, 1, 2, 2])
              >>> print(arr) # doctest: +SKIP
              array([[0.14617829, 0.05045189],
                     [0.37358843, 0.32233918]])

              ```

            - If you check the values of the 2x2 block, you will find them the same as the values in the entire array
                of band 0, starting at the 2nd row and 2nd column.

            - Read a block using a GeoDataFrame polygon that covers the same area as the window above:

              ```python
              >>> import geopandas as gpd
              >>> from shapely.geometry import Polygon
              >>> poly = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)
              >>> arr = dataset.read_array(band=0, window=poly)
              >>> print(arr) # doctest: +SKIP
              array([[0.14617829, 0.05045189],
                     [0.37358843, 0.32233918]])

              ```

        See Also:
            - Dataset.get_tile: Read the dataset in chunks.
            - Dataset.get_block_arrangement: Get block arrangement to read the dataset in chunks.
        """
        # Normalize a polygon window into [offset_x, offset_y, columns, rows] before
        # any indexing. Previously a GeoDataFrame window combined with band=None on a
        # multi-band raster failed below at `window[3]`, because a GeoDataFrame does
        # not support positional integer indexing.
        if isinstance(window, GeoDataFrame):
            window = self._convert_polygon_to_window(window)

        if band is None and self.band_count > 1:
            rows = self.rows if window is None else window[3]
            columns = self.columns if window is None else window[2]
            arr = np.ones(
                (
                    self.band_count,
                    rows,
                    columns,
                ),
                dtype=self.numpy_dtype[0],
            )

            for i in range(self.band_count):
                if window is None:
                    # this line could be replaced with the following line
                    # arr[i, :, :] = self._iloc(i).ReadAsArray()
                    arr[i, :, :] = self._raster.GetRasterBand(i + 1).ReadAsArray()
                else:
                    arr[i, :, :] = self._read_block(i, window)
        else:
            # given band number or the raster has only one band
            if band is None:
                band = 0
            else:
                if band > self.band_count - 1:
                    raise ValueError(
                        f"band index should be between 0 and {self.band_count - 1}"
                    )
            if window is None:
                arr = self._iloc(band).ReadAsArray()
            else:
                arr = self._read_block(band, window)

        return arr

    def _read_block(
        self: Dataset, band: int, window: list[int] | GeoDataFrame | None = None
    ) -> np.ndarray:
        """Read block of data from the dataset.

        Args:
            band (int):
                Band index.
            window (List[int] | GeoDataFrame):
                - List[int]: Window to specify a block of data to read from the dataset.
                    The window should be a list of 4 integers [offset_x, offset_y, window_columns, window_rows].
                    - offset_x: x offset of the block.
                    - offset_y: y offset of the block.
                    - window_columns: number of columns in the block.
                    - window_rows: number of rows in the block.
                - GeoDataFrame:
                    A GeoDataFrame with a polygon geometry. The function will get the total_bounds of the
                    GeoDataFrame and use it as a window to read the raster.

        Raises:
            ValueError: If the window is neither a list/tuple nor a GeoDataFrame.
            OutOfBoundsError: If the window falls outside the raster bounds.

        Returns:
            np.ndarray:
                Array with the values of the block. The shape of the array is (window[2], window[3]), and the
                location of the block in the raster is (window[0], window[1]).
        """
        if isinstance(window, GeoDataFrame):
            window = self._convert_polygon_to_window(window)
        if not isinstance(window, (list, tuple)):
            raise ValueError(f"window must be a list of 4 integers, got {type(window)}")
        try:
            block = self._iloc(band).ReadAsArray(
                window[0], window[1], window[2], window[3]
            )
        except Exception as e:
            # GDAL reports an out-of-bounds window with this marker string in the
            # exception message. Use str(e) rather than e.args[0].__contains__ so a
            # non-string (or empty) args tuple cannot mask the original error with a
            # TypeError/IndexError.
            if "Access window out of range in RasterIO()" in str(e):
                raise OutOfBoundsError(
                    f"The window you entered ({window}) is out of the raster bounds: {self.rows, self.columns}"
                ) from e
            raise
        return np.asarray(block)

    def _convert_polygon_to_window(
        self: Dataset, poly: GeoDataFrame | FeatureCollection
    ) -> list[Any]:
        """Convert a polygon's bounding box into a raster read window.

        Takes the total_bounds of the given geometries, maps the top-left and
        bottom-right corners to array indices via `map_to_array_coordinates`, and
        returns a `[xoff, yoff, x_size, y_size]` window usable by `_read_block`.

        Args:
            poly (GeoDataFrame | FeatureCollection):
                Geometries whose combined bounding box defines the window.

        Returns:
            list[Any]:
                [xoff, yoff, x_size, y_size] in array (pixel) coordinates.
        """
        poly = FeatureCollection(poly)
        # total_bounds is (min_x, min_y, max_x, max_y).
        bounds = poly.total_bounds
        df = pd.DataFrame(columns=["id", "x", "y"])
        df.loc["top_left", ["x", "y"]] = bounds[0], bounds[3]
        df.loc["bottom_right", ["x", "y"]] = bounds[2], bounds[1]
        arr_indeces = self.map_to_array_coordinates(df)
        # NOTE(review): the offsets take xoff from column [0, 1] and yoff from
        # column [0, 0], but the sizes below difference the OPPOSITE columns
        # (x_size from [:, 0], y_size from [:, 1]). This cancels out only for
        # square windows; verify against map_to_array_coordinates whether
        # x_size/y_size are swapped for rectangular windows.
        xoff = arr_indeces[0, 1]
        yoff = arr_indeces[0, 0]
        x_size = arr_indeces[1, 0] - arr_indeces[0, 0]
        y_size = arr_indeces[1, 1] - arr_indeces[0, 1]
        return [xoff, yoff, x_size, y_size]

    def write_array(self: Dataset, array: np.ndarray, top_left_corner: list[int]) -> None:
        """Write an array to the dataset at the given xoff, yoff position.

        Args:
            array (np.ndarray):
                The array to write
            top_left_corner (list[int]):
                indices [row, column]/[y_offset, x_offset] of the cell to write the array to.

        Raises:
            Exception: If the array is not written successfully.

        Hint:
            - The `Dataset` has to be opened in a write mode `read_only=False`.

        Returns:
        None

        Examples:
            - First, create a dataset on disk:

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(5, 5)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> path = 'write_array.tif'
              >>> dataset = Dataset.create_from_array(
              ...     arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326, path=path
              ... )
              >>> dataset = None

              ```

            - In a later session you can read the dataset in a `write` mode and update it:

              ```python
              >>> dataset = Dataset.read_file(path, read_only=False)
              >>> arr = np.array([[1, 2], [3, 4]])
              >>> dataset.write_array(arr, top_left_corner=[1, 1])
              >>> dataset.read_array()    # doctest: +SKIP
              array([[0.77359738, 0.64789596, 0.37912658, 0.03673771, 0.69571106],
                     [0.60804387, 1.        , 2.        , 0.501909  , 0.99597122],
                     [0.83879291, 3.        , 4.        , 0.33058081, 0.59824467],
                     [0.774213  , 0.94338147, 0.16443719, 0.28041457, 0.61914179],
                     [0.97201104, 0.81364799, 0.35157525, 0.65554998, 0.8589739 ]])

              ```
        """
        yoff, xoff = top_left_corner
        try:
            self._raster.WriteArray(array, xoff=xoff, yoff=yoff)
            self._raster.FlushCache()
        except Exception as e:
            raise e

    def get_block_arrangement(
        self: Dataset,
        band: int = 0,
        x_block_size: int | None = None,
        y_block_size: int | None = None,
    ) -> DataFrame:
        """Get Block Arrangement.

        Args:
            band (int, optional):
                band index, by default 0
            x_block_size (int, optional):
                x block size/number of columns, by default None
            y_block_size (int, optional):
                y block size/number of rows, by default None

        Returns:
            DataFrame:
                with the following columns: [x_offset, y_offset, window_xsize, window_ysize]

        Examples:
            - Example of getting block arrangement:

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(13, 14)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> df = dataset.get_block_arrangement(x_block_size=5, y_block_size=5)
              >>> print(df)
                 x_offset  y_offset  window_xsize  window_ysize
              0         0         0             5             5
              1         5         0             5             5
              2        10         0             4             5
              3         0         5             5             5
              4         5         5             5             5
              5        10         5             4             5
              6         0        10             5             3
              7         5        10             5             3
              8        10        10             4             3

              ```
        """
        block_sizes = self.block_size[band]
        x_block_size = block_sizes[0] if x_block_size is None else x_block_size
        y_block_size = block_sizes[1] if y_block_size is None else y_block_size

        df = pd.DataFrame(
            [
                {
                    "x_offset": x,
                    "y_offset": y,
                    "window_xsize": min(x_block_size, self.columns - x),
                    "window_ysize": min(y_block_size, self.rows - y),
                }
                for y in range(0, self.rows, y_block_size)
                for x in range(0, self.columns, x_block_size)
            ],
            columns=["x_offset", "y_offset", "window_xsize", "window_ysize"],
        )
        return df

    def to_file(
        self: Dataset,
        path: str | Path,
        band: int = 0,
        tile_length: int | None = None,
        creation_options: list[str] | None = None,
    ) -> None:
        """Save dataset to tiff file.

            `to_file` saves a raster to disk, the type of the driver (geotiff/netcdf/ascii) will be implied from the
            extension at the end of the given path.

        Args:
            path (str):
                A path including the name of the dataset.
            band (int):
                Band index, needed only in case of ascii drivers. Default is 0.
            tile_length (int, optional):
                Length of the tiles in the driver. Default is 256.
            creation_options: List[str], Default is None
                List of strings that will be passed to the GDAL driver during the creation of the dataset.
                i.e., ['PREDICTOR=2']

        Raises:
            TypeError: If `path` is not a string or Path.
            FailedToSaveError: If GDAL fails and no file was written to `path`.

        Examples:
            - Create a Dataset with 4 bands, 5 rows, 5 columns, at the point lon/lat (0, 0):

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 5, 5)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset.file_name)
              <BLANKLINE>

              ```

            - Now save the dataset as a geotiff file:

              ```python
              >>> dataset.to_file("my-dataset.tif")
              >>> print(dataset.file_name)
              my-dataset.tif

              ```
        """
        if not isinstance(path, (str, Path)):
            raise TypeError(
                f"path input should be string or Path type, given: {type(path)}"
            )

        # The GDAL driver is resolved from the file extension via the catalog.
        path = Path(path)
        extension = path.suffix[1:]
        driver = CATALOG.get_driver_name_by_extension(extension)
        driver_name = CATALOG.get_gdal_name(driver)

        if driver == "ascii":
            # ASCII grids hold a single band, so only `band` is written.
            arr = self.read_array(band=band)
            no_data_value = self.no_data_value[band]
            xmin, ymin, _, _ = self.bbox
            _io.to_ascii(arr, self.cell_size, xmin, ymin, no_data_value, path)
        else:
            # saving rasters with color table fails with a runtime error
            options = ["COMPRESS=DEFLATE"]
            if tile_length is not None:
                options += [
                    "TILED=YES",
                    f"TILE_LENGTH={tile_length}",
                ]
                # Reuse the source block size for the tiled output when known.
                if self._block_size is not None and self._block_size != []:
                    options += [
                        "BLOCKXSIZE={}".format(self._block_size[0][0]),
                        "BLOCKYSIZE={}".format(self._block_size[0][1]),
                    ]
            if creation_options is not None:
                options += creation_options

            try:
                self.raster.FlushCache()
                dst = gdal.GetDriverByName(driver_name).CreateCopy(
                    str(path), self.raster, 0, options=options
                )
                # Re-point this Dataset at the on-disk copy (write mode).
                self._update_inplace(dst, "write")
                # flush the data to the dataset on disk.
                dst.FlushCache()
            except RuntimeError:
                # NOTE(review): a RuntimeError is deliberately swallowed when the
                # file was still created (e.g. the color-table issue noted above);
                # it is only surfaced when nothing reached disk — confirm this
                # best-effort behavior is intentional.
                if not path.exists():
                    raise FailedToSaveError(
                        f"Failed to save the {driver_name} raster to the path: {path}"
                    )

    def _tile_offsets(self: Dataset, size: int = 256) -> Generator:
        """Dataset square window size/offsets.

        Args:
            size (int):
                Size of the window in pixels. One value required which is used for both the x and y size. e.g.,
                256 means a 256x256 window. Default is 256.

        Yields:
            tuple[int, int, int, int]:
                (x-offset/column-index, y-offset/row-index, x-size, y-size).

        Examples:
            - Generate 2x2 windows over a 3x5 dataset:

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(3, 5)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> tile_dimensions = list(dataset._tile_offsets(2))
              >>> print(tile_dimensions)
              [(0, 0, 2, 2), (2, 0, 2, 2), (4, 0, 1, 2), (0, 2, 2, 1), (2, 2, 2, 1), (4, 2, 1, 1)]

              ```
        """
        cols = self.columns
        rows = self.rows
        for yoff in range(0, rows, size):
            ysize = size if size + yoff <= rows else rows - yoff
            for xoff in range(0, cols, size):
                xsize = size if size + xoff <= cols else cols - xoff
                yield xoff, yoff, xsize, ysize

    def get_tile(self: Dataset, size=256) -> Generator[np.ndarray]:
        """Get tile.

        Args:
            size (int):
                Size of the window in pixels. One value is required which is used for both the x and y size. e.g., 256
                means a 256x256 window. Default is 256.

        Yields:
            np.ndarray:
                Dataset array with a shape `[band, y, x]`.

        Examples:
            - First, we will create a dataset with 3 rows and 5 columns.

              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(3, 5)
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              >>> print(dataset)
              <BLANKLINE>
                          Cell size: 0.05
                          Dimension: 3 * 5
                          EPSG: 4326
                          Number of Bands: 1
                          Band names: ['Band_1']
                          Mask: -9999.0
                          Data type: float64
                          File:...
              <BLANKLINE>

              >>> print(dataset.read_array())   # doctest: +SKIP
              [[0.55332314 0.48364841 0.67794589 0.6901816  0.70516817]
               [0.82518332 0.75657103 0.45693945 0.44331782 0.74677865]
               [0.22231314 0.96283065 0.15201337 0.03522544 0.44616888]]

              ```
            - The `get_tile` method splits the domain into tiles of the specified `size` using the `_tile_offsets` function.

              ```python
              >>> tile_dimensions = list(dataset._tile_offsets(2))
              >>> print(tile_dimensions)
              [(0, 0, 2, 2), (2, 0, 2, 2), (4, 0, 1, 2), (0, 2, 2, 1), (2, 2, 2, 1), (4, 2, 1, 1)]

              ```
              ![get_tile](./../../_images/dataset/get_tile.png)

            - So the first two chunks are 2*2, 2*1 chunk, then two 1*2 chunks, and the last chunk is 1*1.
            - The `get_tile` method returns a generator object that can be used to iterate over the smaller chunks of
                the data.

              ```python
              >>> tiles_generator = dataset.get_tile(size=2)
              >>> print(tiles_generator)  # doctest: +SKIP
              <generator object Dataset.get_tile at 0x00000145AA39E680>
              >>> print(list(tiles_generator))  # doctest: +SKIP
              [
                  array([[0.55332314, 0.48364841],
                         [0.82518332, 0.75657103]]),
                  array([[0.67794589, 0.6901816 ],
                         [0.45693945, 0.44331782]]),
                  array([[0.70516817], [0.74677865]]),
                  array([[0.22231314, 0.96283065]]),
                  array([[0.15201337, 0.03522544]]),
                  array([[0.44616888]])
              ]

              ```
        """
        for xoff, yoff, xsize, ysize in self._tile_offsets(size=size):
            # read the array at certain indices
            yield self.raster.ReadAsArray(
                xoff=xoff, yoff=yoff, xsize=xsize, ysize=ysize
            )

    def map_blocks(
        self: Dataset,
        func: Callable[[np.ndarray], np.ndarray],
        tile_size: int = 256,
        band: int | None = None,
    ) -> Dataset:
        """Apply a function block-by-block without loading the full raster into memory.

        Reads the raster in tiles of `tile_size x tile_size`, applies `func` to each
        tile, and writes the result to a new in-memory Dataset. Neither the input nor
        the output array needs to fit in memory at once — only one tile at a time.

        This is the key enabler for processing rasters larger than RAM.

        Args:
            func (Callable[[np.ndarray], np.ndarray]):
                A function that takes a numpy array (the tile) and returns a numpy array
                of the same shape. The function should handle no-data values internally
                if needed.
            tile_size (int):
                Size of each square tile in pixels. Default is 256.
            band (int | None):
                Band index to process. If None, all bands are processed. Default is None.

        Returns:
            Dataset:
                A new Dataset with the function applied to every tile.

        Examples:
            - Apply a function block-by-block to avoid loading a large raster into memory:

              ```python
              >>> import numpy as np
              >>> arr = np.arange(1, 101, dtype=np.float32).reshape(10, 10)
              >>> dataset = Dataset.create_from_array(
              ...     arr, top_left_corner=(0, 0), cell_size=1.0, epsg=4326
              ... )
              >>> result = dataset.map_blocks(lambda tile: tile * 2, tile_size=5)
              >>> print(result.read_array()[0, 0])
              2.0

              ```
        """
        # Resolve the output geometry once: either the single selected band or
        # all bands of the source.
        if band is not None:
            bands = 1
            dtype = self.gdal_dtype[band]
            no_data = [self.no_data_value[band]]
        else:
            bands = self.band_count
            dtype = self.gdal_dtype[0]
            no_data = self.no_data_value

        dst_obj = type(self)._build_dataset(
            self.columns, self.rows, bands, dtype,
            self.geotransform, self.crs, no_data,
        )

        # Source band indices are 0-based; output GDAL bands are numbered from 1.
        src_band_indices = [band] if band is not None else list(range(self.band_count))

        for xoff, yoff, xsize, ysize in self._tile_offsets(size=tile_size):
            for out_band, src_band in enumerate(src_band_indices, start=1):
                # _iloc(i) is the 0-based accessor for GetRasterBand(i + 1).
                tile = self._iloc(src_band).ReadAsArray(xoff, yoff, xsize, ysize)
                result_tile = func(np.asarray(tile))
                dst_obj.raster.GetRasterBand(out_band).WriteArray(
                    result_tile, xoff, yoff
                )

        return dst_obj

    def to_xyz(
        self: Dataset, bands: list[int] | None = None, path: str | Path | None = None
    ) -> DataFrame | None:
        """Convert to XYZ.

        Args:
            bands (List[int], optional):
                indices of the bands. If None, all bands will be used. default is None
            path (str, optional):
                path to the file where the data will be saved. If None, the data will be returned as a DataFrame.
                default is None.

        Raises:
            ValueError: If `bands` is neither None, an integer, nor a list of integers.

        Returns:
            DataFrame/File:
                DataFrame with columns: lon, lat, band_1, band_2,... . If a path is provided the data will be saved to
                disk as a .xyz file

        Examples:
            - First we will create a dataset from a float32 array with values between 1 and 10, and then we will
                assign a scale of 0.1 to the dataset.
                ```python
                >>> import numpy as np
                >>> arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
                >>> top_left_corner = (0, 0)
                >>> cell_size = 0.05
                >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
                >>> print(dataset)
                <BLANKLINE>
                            Top Left Corner: (0.0, 0.0)
                            Cell size: 0.05
                            Dimension: 2 * 2
                            EPSG: 4326
                            Number of Bands: 2
                            Band names: ['Band_1', 'Band_2']
                            Band colors: {0: 'undefined', 1: 'undefined'}
                            Band units: ['', '']
                            Scale: [1.0, 1.0]
                            Offset: [0, 0]
                            Mask: -9999.0
                            Data type: int64
                            File: ...
                <BLANKLINE>
                >>> df = dataset.to_xyz()
                >>> print(df)
                     lon    lat  Band_1  Band_2
                0  0.025 -0.025       1       5
                1  0.075 -0.025       2       6
                2  0.025 -0.075       3       7
                3  0.075 -0.075       4       8
                ```
        """
        # Normalize `bands` (0-based) into GDAL band numbers (1-based).
        if bands is None:
            band_nums = list(range(1, self.band_count + 1))
        elif isinstance(bands, int):
            band_nums = [bands + 1]
        elif isinstance(bands, list):
            band_nums = [band + 1 for band in bands]
        else:
            raise ValueError("bands must be an integer or a list of integers.")

        arr = gdal2xyz.gdal2xyz(
            self.raster,
            str(path) if path is not None else None,
            skip_nodata=True,
            return_np_arrays=True,
            band_nums=band_nums,
        )
        if path is not None:
            # The data was written to disk by gdal2xyz; nothing to return.
            return None

        # band_nums is 1-based while band_names is 0-indexed.
        band_names = [self.band_names[num - 1] for num in band_nums]
        df = pd.DataFrame(columns=["lon", "lat"] + band_names)
        df["lon"] = arr[0]
        df["lat"] = arr[1]
        df[band_names] = arr[2].transpose()
        return df
    @property
    def overview_count(self: Dataset) -> list[int]:
        """Number of the overviews for each band."""
        overview_number = []
        for i in range(self.band_count):
            overview_number.append(self._iloc(i).GetOverviewCount())
        return overview_number
    def create_overviews(
        self: Dataset,
        resampling_method: str = "nearest",
        overview_levels: list | None = None,
    ) -> None:
        """Build overviews (reduced-resolution pyramids) for every band.
        Args:
            resampling_method (str):
                Resampling algorithm used to aggregate cells, one of "NEAREST", "CUBIC",
                "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE",
                "RMS", "BILINEAR" (case-insensitive). Defaults to "nearest".
            overview_levels (list, optional):
                Reduction factors, restricted to the typical power-of-two factors.
                Defaults to [2, 4, 8, 16, 32].
        Returns:
            None:
                Creates internal or external overviews depending on the dataset access mode. See Notes.
        Notes:
            - External (.ovr file): a dataset opened with `read_only=True` gets its overviews
              written to an external .ovr file in the same directory as the dataset.
            - Internal: a dataset opened with `read_only=False` gets internal overviews; the
              dataset needs to be saved/flushed to persist them to disk.
            - The per-band overview count is available via the `overview_count` property.
        Examples:
            - Create a Dataset with 4 bands, 10 rows, 10 columns, then build overviews:
              ```python
              >>> import numpy as np
              >>> arr = np.random.rand(4, 10, 10)
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=(0, 0), cell_size=0.05, epsg=4326)
              >>> dataset.create_overviews()
              >>> print(dataset.overview_count)  # doctest: +SKIP
              [4, 4, 4, 4]
              ```
            - Each overview level can then be plotted directly:
              ```python
              >>> dataset.plot(band=0, overview=True, overview_index=0) # doctest: +SKIP
              ```
        See Also:
            - Dataset.recreate_overviews: Recreate the dataset overviews if they exist
            - Dataset.get_overview: Get an overview of a band
            - Dataset.overview_count: Number of overviews
            - Dataset.read_overview_array: Read overview values
            - Dataset.plot: Plot a band
        """
        if overview_levels is None:
            overview_levels = OVERVIEW_LEVELS
        elif not isinstance(overview_levels, list):
            raise TypeError("overview_levels should be a list")
        elif any(level not in OVERVIEW_LEVELS for level in overview_levels):
            raise ValueError(
                "overview_levels are restricted to the typical power-of-two reduction factors "
                "(like 2, 4, 8, 16, etc.)"
            )

        if resampling_method.upper() not in RESAMPLING_METHODS:
            raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")

        # Each level is a reduction factor relative to the full-resolution raster:
        # 2 -> half the resolution, 4 -> a quarter, and so on.
        self.raster.BuildOverviews(resampling_method, overview_levels)
    def recreate_overviews(self: Dataset, resampling_method: str = "nearest") -> None:
        """Recreate overviews for the dataset.
        Regenerates the pixel values of every existing overview of every band from the
        current band data using ``gdal.RegenerateOverview``.
        Args:
            resampling_method (str): Resampling method used to recreate overviews. Possible values are
                "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE",
                "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest".
        Raises:
            ValueError:
                If resampling_method is not one of the allowed values above.
            ReadOnlyError:
                If overviews are internal and the dataset is opened read-only. Read with read_only=False.
        See Also:
            - Dataset.create_overviews: Create the dataset overviews.
            - Dataset.get_overview: Get an overview of a band.
            - Dataset.overview_count: Number of overviews.
            - Dataset.read_overview_array: Read overview values.
            - Dataset.plot: Plot a band.
        """
        if resampling_method.upper() not in RESAMPLING_METHODS:
            raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")
        try:
            # Hoist the property lookup: `overview_count` loops over all bands on
            # every access, so recomputing it inside the loop would be quadratic.
            overview_counts = self.overview_count
            for i in range(self.band_count):
                band = self._iloc(i)
                for j in range(overview_counts[i]):
                    ovr = self.get_overview(i, j)
                    # TODO: if this method takes a long time, gdal.RegenerateOverviews() is
                    #  faster, but it does not give the option to choose the resampling method,
                    #  and the overviews have to be given to the function as a list:
                    #  overviews = [band.GetOverview(i) for i in range(band.GetOverviewCount())]
                    #  band.RegenerateOverviews(overviews) or gdal.RegenerateOverviews(overviews)
                    gdal.RegenerateOverview(band, ovr, resampling_method)
        except RuntimeError as e:
            # Internal overviews cannot be rewritten on a read-only dataset; chain the
            # original GDAL error so the root cause stays visible in the traceback.
            raise ReadOnlyError(
                "The Dataset is opened with a read only. Please read the dataset using read_only=False"
            ) from e
    def get_overview(self: Dataset, band: int = 0, overview_index: int = 0) -> gdal.Band:
        """Get an overview of a band.
        Args:
            band (int):
                The band index. Defaults to 0.
            overview_index (int):
                Index of the overview. Defaults to 0.
        Returns:
            gdal.Band:
                GDAL band object.
        Raises:
            ValueError:
                If the band has no overviews, or if overview_index is out of range.
        Examples:
            - Create `Dataset` consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):
              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              ```
            - Now, create overviews using the default parameters and inspect them:
              ```python
              >>> dataset.create_overviews()
              >>> print(dataset.overview_count)  # doctest: +SKIP
              [4, 4, 4, 4]
              >>> ovr = dataset.get_overview(band=0, overview_index=0)
              >>> print(ovr)  # doctest: +SKIP
              <osgeo.gdal.Band; proxy of <Swig Object of type 'GDALRasterBandShadow *' at 0x0000017E2B5AF1B0> >
              >>> ovr.ReadAsArray()  # doctest: +SKIP
              array([[6, 3, 4, 4, 8],
                     [5, 9, 7, 5, 6],
                     [5, 3, 1, 7, 5],
                     [1, 4, 2, 6, 9],
                     [4, 3, 6, 5, 8]], dtype=int32)
              >>> ovr = dataset.get_overview(band=0, overview_index=1)
              >>> ovr.ReadAsArray()  # doctest: +SKIP
              array([[6, 7, 3],
                     [2, 5, 6],
                     [6, 9, 9]], dtype=int32)
              >>> ovr = dataset.get_overview(band=0, overview_index=3)
              >>> ovr.ReadAsArray()  # doctest: +SKIP
              array([[6]], dtype=int32)
              ```
        See Also:
            - Dataset.create_overviews: Create the dataset overviews.
            - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
            - Dataset.overview_count: Number of overviews.
            - Dataset.read_overview_array: Read overview values.
            - Dataset.plot: Plot a band.
        """
        band_obj = self._iloc(band)
        n_views = band_obj.GetOverviewCount()
        if n_views == 0:
            raise ValueError(
                "The band has no overviews, please use the `create_overviews` method to build the overviews"
            )
        # Reject negative indexes too: GDAL's GetOverview silently returns None for
        # out-of-range values instead of raising. The message also names the actual
        # parameter (`overview_index`, not the nonexistent `overview_level`).
        if not 0 <= overview_index < n_views:
            raise ValueError(
                f"overview_index should be between 0 and {n_views - 1}, given: {overview_index}"
            )
        # TODO: find a way to create a Dataset object from the overview band and to return
        #  the Dataset object instead of the gdal band.
        return band_obj.GetOverview(overview_index)
    def read_overview_array(
        self: Dataset, band: int | None = None, overview_index: int = 0
    ) -> np.ndarray:
        """Read overview values.
            - Read the values stored in a given band or overview.
        Args:
            band (int | None):
                The band to read. If None and multiple bands exist, reads all bands at the given overview.
            overview_index (int):
                Index of the overview. Defaults to 0.
        Returns:
            np.ndarray:
                Array with the values in the raster.
        Raises:
            ValueError:
                If a requested band has no overviews, or the band index is out of range.
        Examples:
            - Create `Dataset` consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):
              ```python
              >>> import numpy as np
              >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
              >>> top_left_corner = (0, 0)
              >>> cell_size = 0.05
              >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
              ```
            - Create overviews using the default parameters and read overview arrays:
              ```python
              >>> dataset.create_overviews()
              >>> print(dataset.overview_count)  # doctest: +SKIP
              [4, 4, 4, 4]
              >>> arr = dataset.read_overview_array(band=0, overview_index=0)
              >>> print(arr)  # doctest: +SKIP
              array([[6, 3, 4, 4, 8],
                     [5, 9, 7, 5, 6],
                     [5, 3, 1, 7, 5],
                     [1, 4, 2, 6, 9],
                     [4, 3, 6, 5, 8]], dtype=int32)
              >>> arr = dataset.read_overview_array(band=0, overview_index=1)
              >>> print(arr)  # doctest: +SKIP
              array([[6, 7, 3],
                     [2, 5, 6],
                     [6, 9, 9]], dtype=int32)
              >>> arr = dataset.read_overview_array(band=0, overview_index=3)
              >>> print(arr)  # doctest: +SKIP
              array([[6]], dtype=int32)
              ```
        See Also:
            - Dataset.create_overviews: Create the dataset overviews.
            - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
            - Dataset.get_overview: Get an overview of a band.
            - Dataset.overview_count: Number of overviews.
            - Dataset.plot: Plot a band.
        """
        if band is None and self.band_count > 1:
            if any(elem == 0 for elem in self.overview_count):
                raise ValueError(
                    "Some bands do not have overviews, please create overviews first"
                )
            # Probe the REQUESTED overview level of the first band to size the output.
            # Every level has its own reduced shape, so probing level 0 (as the code
            # previously did) broke with a broadcast error for any overview_index != 0.
            first_band_ovr = np.asarray(
                self.get_overview(0, overview_index).ReadAsArray()
            )
            arr: np.ndarray = np.empty(
                (
                    self.band_count,
                    first_band_ovr.shape[0],
                    first_band_ovr.shape[1],
                ),
                dtype=self.numpy_dtype[0],
            )
            # Reuse the probe for band 0 instead of reading it twice.
            arr[0, :, :] = first_band_ovr
            for i in range(1, self.band_count):
                arr[i, :, :] = self.get_overview(i, overview_index).ReadAsArray()
        else:
            if band is None:
                band = 0
            else:
                # Reject negative band indexes as well as too-large ones.
                if not 0 <= band < self.band_count:
                    raise ValueError(
                        f"band index should be between 0 and {self.band_count - 1}"
                    )
                if self.overview_count[band] == 0:
                    raise ValueError(
                        f"band {band} has no overviews, please create overviews first"
                    )
            arr = np.asarray(self.get_overview(band, overview_index).ReadAsArray())
        return arr

overview_count property #

Number of the overviews for each band.

read_array(band=None, window=None) #

Read the values stored in a given band.

Data Chunks/blocks: When a raster dataset is stored on disk, it might not be stored as one continuous chunk of data. Instead, it can be divided into smaller rectangular blocks or tiles. These blocks can be individually accessed, which is particularly useful for large datasets:

    - Efficiency: Reading or writing small blocks requires less memory than dealing with the entire dataset
          at once. This is especially beneficial when only a small portion of the data needs to be processed.
    - Performance: For certain file formats and operations, working with optimal block sizes can significantly
          improve performance. For example, if the block size matches the reading or processing window,
          Pyramids can minimize disk access and data transfer.

Parameters:

Name Type Description Default
band int

The band you want to get its data. If None, data of all bands will be read. Default is None.

None
window List[int] | GeoDataFrame

Specify a block of data to read from the dataset. The window can be specified in two ways:

  • List: Window specified as a list of 4 integers [offset_x, offset_y, window_columns, window_rows].

    • offset_x/column index: x offset of the block.
    • offset_y/row index: y offset of the block.
    • window_columns: number of columns in the block.
    • window_rows: number of rows in the block.
  • GeoDataFrame: GeoDataFrame with a geometry column filled with polygon geometries; the function will get the total_bounds of the GeoDataFrame and use it as a window to read the raster.

None

Returns:

Type Description
ndarray

np.ndarray: array with all the values in the raster.

Examples:

  • Create Dataset consisting of 4 bands, 5 rows, and 5 columns at the point lon/lat (0, 0):
>>> import numpy as np
>>> arr = np.random.rand(4, 5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
  • Read all the values stored in a given band:
>>> arr = dataset.read_array(band=0) # doctest: +SKIP
array([[0.50482225, 0.45678043, 0.53294294, 0.28862223, 0.66753579],
       [0.38471912, 0.14617829, 0.05045189, 0.00761358, 0.25501918],
       [0.32689036, 0.37358843, 0.32233918, 0.75450564, 0.45197608],
       [0.22944676, 0.2780928 , 0.71605189, 0.71859309, 0.61896933],
       [0.47740168, 0.76490779, 0.07679277, 0.16142599, 0.73630836]])
  • Read a 2x2 block from the first band. The block starts at the 2nd column (index 1) and 2nd row (index 1) (the first index is the column index):
>>> arr = dataset.read_array(band=0, window=[1, 1, 2, 2])
>>> print(arr) # doctest: +SKIP
array([[0.14617829, 0.05045189],
       [0.37358843, 0.32233918]])
  • If you check the values of the 2x2 block, you will find them the same as the values in the entire array of band 0, starting at the 2nd row and 2nd column.

  • Read a block using a GeoDataFrame polygon that covers the same area as the window above:

>>> import geopandas as gpd
>>> from shapely.geometry import Polygon
>>> poly = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)
>>> arr = dataset.read_array(band=0, window=poly)
>>> print(arr) # doctest: +SKIP
array([[0.14617829, 0.05045189],
       [0.37358843, 0.32233918]])
See Also
  • Dataset.get_tile: Read the dataset in chunks.
  • Dataset.get_block_arrangement: Get block arrangement to read the dataset in chunks.
Source code in src/pyramids/dataset/ops/io.py
def read_array(
    self: Dataset, band: int | None = None, window: GeoDataFrame | list[int] | None = None
) -> np.ndarray:
    """Read the values stored in a given band.

    Data Chunks/blocks
        When a raster dataset is stored on disk, it might not be stored as one continuous chunk of data. Instead,
        it can be divided into smaller rectangular blocks or tiles. These blocks can be individually accessed,
        which is particularly useful for large datasets:

            - Efficiency: Reading or writing small blocks requires less memory than dealing with the entire dataset
                  at once. This is especially beneficial when only a small portion of the data needs to be processed.
            - Performance: For certain file formats and operations, working with optimal block sizes can significantly
                  improve performance. For example, if the block size matches the reading or processing window,
                  Pyramids can minimize disk access and data transfer.

    Args:
        band (int, optional):
            The band you want to get its data. If None, data of all bands will be read. Default is None.
        window (List[int] | GeoDataFrame, optional):
            Specify a block of data to read from the dataset. The window can be specified in two ways:

            - List:
                Window specified as a list of 4 integers [offset_x, offset_y, window_columns, window_rows].

                - offset_x/column index: x offset of the block.
                - offset_y/row index: y offset of the block.
                - window_columns: number of columns in the block.
                - window_rows: number of rows in the block.

            - GeoDataFrame:
                GeoDataFrame with a geometry column filled with polygon geometries; the function will get the
                total_bounds of the GeoDataFrame and use it as a window to read the raster.

    Returns:
        np.ndarray:
            array with all the values in the raster.

    Examples:
        - Create `Dataset` consisting of 4 bands, 5 rows, and 5 columns at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)

          ```

        - Read all the values stored in a given band:

          ```python
          >>> arr = dataset.read_array(band=0) # doctest: +SKIP
          array([[0.50482225, 0.45678043, 0.53294294, 0.28862223, 0.66753579],
                 [0.38471912, 0.14617829, 0.05045189, 0.00761358, 0.25501918],
                 [0.32689036, 0.37358843, 0.32233918, 0.75450564, 0.45197608],
                 [0.22944676, 0.2780928 , 0.71605189, 0.71859309, 0.61896933],
                 [0.47740168, 0.76490779, 0.07679277, 0.16142599, 0.73630836]])

          ```

        - Read a 2x2 block from the first band. The block starts at the 2nd column (index 1) and 2nd row (index 1)
            (the first index is the column index):

          ```python
          >>> arr = dataset.read_array(band=0, window=[1, 1, 2, 2])
          >>> print(arr) # doctest: +SKIP
          array([[0.14617829, 0.05045189],
                 [0.37358843, 0.32233918]])

          ```

        - If you check the values of the 2x2 block, you will find them the same as the values in the entire array
            of band 0, starting at the 2nd row and 2nd column.

        - Read a block using a GeoDataFrame polygon that covers the same area as the window above:

          ```python
          >>> import geopandas as gpd
          >>> from shapely.geometry import Polygon
          >>> poly = gpd.GeoDataFrame(geometry=[Polygon([(0.1, -0.1), (0.1, -0.2), (0.2, -0.2), (0.2, -0.1)])], crs=4326)
          >>> arr = dataset.read_array(band=0, window=poly)
          >>> print(arr) # doctest: +SKIP
          array([[0.14617829, 0.05045189],
                 [0.37358843, 0.32233918]])

          ```

    See Also:
        - Dataset.get_tile: Read the dataset in chunks.
        - Dataset.get_block_arrangement: Get block arrangement to read the dataset in chunks.
    """
    # Multi-band path: allocate one (bands, rows, cols) array and fill it band by band.
    if band is None and self.band_count > 1:
        # NOTE(review): window[3]/window[2] assume a list-style window. A GeoDataFrame
        # window would be indexed here by column label 3/2 — confirm that GeoDataFrame
        # windows are converted before reaching this point or only used via _read_block.
        rows = self.rows if window is None else window[3]
        columns = self.columns if window is None else window[2]
        # The dtype of the first band is used for the whole stack.
        arr = np.ones(
            (
                self.band_count,
                rows,
                columns,
            ),
            dtype=self.numpy_dtype[0],
        )

        for i in range(self.band_count):
            if window is None:
                # this line could be replaced with the following line
                # arr[i, :, :] = self._iloc(i).ReadAsArray()
                arr[i, :, :] = self._raster.GetRasterBand(i + 1).ReadAsArray()
            else:
                # Windowed read is delegated to _read_block (handles list/GeoDataFrame).
                arr[i, :, :] = self._read_block(i, window)
    else:
        # given band number or the raster has only one band
        if band is None:
            band = 0
        else:
            if band > self.band_count - 1:
                raise ValueError(
                    f"band index should be between 0 and {self.band_count - 1}"
                )
        if window is None:
            arr = self._iloc(band).ReadAsArray()
        else:
            arr = self._read_block(band, window)

    return arr

write_array(array, top_left_corner) #

Write an array to the dataset at the given xoff, yoff position.

Parameters:

Name Type Description Default
array ndarray

The array to write

required
top_left_corner list[int]

indices [row, column]/[y_offset, x_offset] of the cell to write the array to.

required

Raises:

Type Description
Exception

If the array is not written successfully.

Hint
  • The Dataset has to be opened in a write mode read_only=False.

Returns: None

Examples:

  • First, create a dataset on disk:
>>> import numpy as np
>>> arr = np.random.rand(5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> path = 'write_array.tif'
>>> dataset = Dataset.create_from_array(
...     arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326, path=path
... )
>>> dataset = None
  • In a later session you can read the dataset in a write mode and update it:
>>> dataset = Dataset.read_file(path, read_only=False)
>>> arr = np.array([[1, 2], [3, 4]])
>>> dataset.write_array(arr, top_left_corner=[1, 1])
>>> dataset.read_array()    # doctest: +SKIP
array([[0.77359738, 0.64789596, 0.37912658, 0.03673771, 0.69571106],
       [0.60804387, 1.        , 2.        , 0.501909  , 0.99597122],
       [0.83879291, 3.        , 4.        , 0.33058081, 0.59824467],
       [0.774213  , 0.94338147, 0.16443719, 0.28041457, 0.61914179],
       [0.97201104, 0.81364799, 0.35157525, 0.65554998, 0.8589739 ]])
Source code in src/pyramids/dataset/ops/io.py
def write_array(self: Dataset, array: np.ndarray, top_left_corner: list[int]) -> None:
    """Write an array to the dataset at the given xoff, yoff position.

    Args:
        array (np.ndarray):
            The array to write.
        top_left_corner (list[int]):
            indices [row, column]/[y_offset, x_offset] of the cell to write the array to.

    Raises:
        Exception: Whatever GDAL raises if the array is not written successfully
            (e.g. the dataset is opened read-only or the window is out of bounds).

    Hint:
        - The `Dataset` has to be opened in a write mode `read_only=False`.

    Returns:
        None

    Examples:
        - First, create a dataset on disk:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> path = 'write_array.tif'
          >>> dataset = Dataset.create_from_array(
          ...     arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326, path=path
          ... )
          >>> dataset = None

          ```

        - In a later session you can read the dataset in a `write` mode and update it:

          ```python
          >>> dataset = Dataset.read_file(path, read_only=False)
          >>> arr = np.array([[1, 2], [3, 4]])
          >>> dataset.write_array(arr, top_left_corner=[1, 1])
          >>> dataset.read_array()    # doctest: +SKIP
          array([[0.77359738, 0.64789596, 0.37912658, 0.03673771, 0.69571106],
                 [0.60804387, 1.        , 2.        , 0.501909  , 0.99597122],
                 [0.83879291, 3.        , 4.        , 0.33058081, 0.59824467],
                 [0.774213  , 0.94338147, 0.16443719, 0.28041457, 0.61914179],
                 [0.97201104, 0.81364799, 0.35157525, 0.65554998, 0.8589739 ]])

          ```
    """
    # top_left_corner is [row, column] while GDAL's WriteArray expects
    # xoff (column) / yoff (row), so unpack in (y, x) order and swap.
    yoff, xoff = top_left_corner
    # The previous `try: ... except Exception as e: raise e` was a no-op that only
    # obscured the code; GDAL errors now propagate directly and unchanged.
    self._raster.WriteArray(array, xoff=xoff, yoff=yoff)
    # Flush so the change is persisted to disk immediately.
    self._raster.FlushCache()

get_block_arrangement(band=0, x_block_size=None, y_block_size=None) #

Get Block Arrangement.

Parameters:

Name Type Description Default
band int

band index, by default 0

0
x_block_size int

x block size/number of columns, by default None

None
y_block_size int

y block size/number of rows, by default None

None

Returns:

Name Type Description
DataFrame DataFrame

with the following columns: [x_offset, y_offset, window_xsize, window_ysize]

Examples:

  • Example of getting block arrangement:
>>> import numpy as np
>>> arr = np.random.rand(13, 14)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> df = dataset.get_block_arrangement(x_block_size=5, y_block_size=5)
>>> print(df)
   x_offset  y_offset  window_xsize  window_ysize
0         0         0             5             5
1         5         0             5             5
2        10         0             4             5
3         0         5             5             5
4         5         5             5             5
5        10         5             4             5
6         0        10             5             3
7         5        10             5             3
8        10        10             4             3
Source code in src/pyramids/dataset/ops/io.py
def get_block_arrangement(
    self: Dataset,
    band: int = 0,
    x_block_size: int | None = None,
    y_block_size: int | None = None,
) -> DataFrame:
    """Tabulate the block/tile layout of the dataset.

    Splits the raster extent into blocks of the requested size (or the band's
    native block size when no size is given) and returns one row per block,
    ordered row-major (top-to-bottom, left-to-right). Edge blocks are clipped
    to the raster bounds, so the last column/row of blocks may be smaller.

    Args:
        band (int, optional):
            band index, by default 0
        x_block_size (int, optional):
            x block size/number of columns, by default None
        y_block_size (int, optional):
            y block size/number of rows, by default None

    Returns:
        DataFrame:
            with the following columns: [x_offset, y_offset, window_xsize, window_ysize]

    Examples:
        - Example of getting block arrangement:

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(13, 14)
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=(0, 0), cell_size=0.05, epsg=4326)
          >>> df = dataset.get_block_arrangement(x_block_size=5, y_block_size=5)
          >>> print(df)
             x_offset  y_offset  window_xsize  window_ysize
          0         0         0             5             5
          1         5         0             5             5
          2        10         0             4             5
          3         0         5             5             5
          4         5         5             5             5
          5        10         5             4             5
          6         0        10             5             3
          7         5        10             5             3
          8        10        10             4             3

          ```
    """
    # Fall back to the band's native block size for any dimension not given.
    native_block = self.block_size[band]
    if x_block_size is None:
        x_block_size = native_block[0]
    if y_block_size is None:
        y_block_size = native_block[1]

    records = []
    for row_start in range(0, self.rows, y_block_size):
        for col_start in range(0, self.columns, x_block_size):
            records.append(
                {
                    "x_offset": col_start,
                    "y_offset": row_start,
                    # Clip the window at the raster edge.
                    "window_xsize": min(x_block_size, self.columns - col_start),
                    "window_ysize": min(y_block_size, self.rows - row_start),
                }
            )
    return pd.DataFrame(
        records,
        columns=["x_offset", "y_offset", "window_xsize", "window_ysize"],
    )

to_file(path, band=0, tile_length=None, creation_options=None) #

Save dataset to tiff file.

`to_file` saves a raster to disk; the type of the driver (geotiff/netcdf/ascii) is inferred from the
extension at the end of the given path.

Parameters:

Name Type Description Default
path str

A path including the name of the dataset.

required
band int

Band index, needed only in case of ascii drivers. Default is 0.

0
tile_length int

Length of the tiles in the driver. Default is 256.

None
creation_options list[str] | None

List[str], Default is None List of strings that will be passed to the GDAL driver during the creation of the dataset. i.e., ['PREDICTOR=2']

None

Examples:

  • Create a Dataset with 4 bands, 5 rows, 5 columns, at the point lon/lat (0, 0):
>>> import numpy as np
>>> arr = np.random.rand(4, 5, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset.file_name)
<BLANKLINE>
  • Now save the dataset as a geotiff file:
>>> dataset.to_file("my-dataset.tif")
>>> print(dataset.file_name)
my-dataset.tif
Source code in src/pyramids/dataset/ops/io.py
def to_file(
    self: Dataset,
    path: str | Path,
    band: int = 0,
    tile_length: int | None = None,
    creation_options: list[str] | None = None,
) -> None:
    """Save the dataset to disk.

        `to_file` saves a raster to disk; the driver type (geotiff/netcdf/ascii) is implied from the
        extension at the end of the given path.

    Args:
        path (str | Path):
            A path including the name of the dataset.
        band (int):
            Band index, needed only in the case of ascii drivers. Default is 0.
        tile_length (int, optional):
            Length of the tiles in the driver. If None, no tiling creation options are passed.
            Default is None.
        creation_options (list[str], optional):
            List of strings that will be passed to the GDAL driver during the creation of the dataset.
            i.e., ['PREDICTOR=2']. Default is None.

    Raises:
        TypeError:
            If `path` is not a string or Path.
        FailedToSaveError:
            If GDAL raises a RuntimeError and no file was created on disk.

    Examples:
        - Create a Dataset with 4 bands, 5 rows, 5 columns, at the point lon/lat (0, 0):

          ```python
          >>> import numpy as np
          >>> arr = np.random.rand(4, 5, 5)
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          >>> print(dataset.file_name)
          <BLANKLINE>

          ```

        - Now save the dataset as a geotiff file:

          ```python
          >>> dataset.to_file("my-dataset.tif")
          >>> print(dataset.file_name)
          my-dataset.tif

          ```
    """
    if not isinstance(path, (str, Path)):
        raise TypeError(
            f"path input should be string or Path type, given: {type(path)}"
        )

    path = Path(path)
    # Resolve the GDAL driver from the file extension (e.g. ".tif" -> geotiff).
    extension = path.suffix[1:]
    driver = CATALOG.get_driver_name_by_extension(extension)
    driver_name = CATALOG.get_gdal_name(driver)

    if driver == "ascii":
        # ASCII grids hold one band, so only the requested band is written.
        arr = self.read_array(band=band)
        no_data_value = self.no_data_value[band]
        xmin, ymin, _, _ = self.bbox
        _io.to_ascii(arr, self.cell_size, xmin, ymin, no_data_value, path)
    else:
        # saving rasters with color table fails with a runtime error
        options = ["COMPRESS=DEFLATE"]
        if tile_length is not None:
            # NOTE(review): "TILE_LENGTH" does not appear in the GTiff creation-option
            # list; unknown options may be ignored/warned by the driver — confirm.
            options += [
                "TILED=YES",
                f"TILE_LENGTH={tile_length}",
            ]
            # Reuse the source block size so output tiles align with existing blocks.
            if self._block_size is not None and self._block_size != []:
                options += [
                    "BLOCKXSIZE={}".format(self._block_size[0][0]),
                    "BLOCKYSIZE={}".format(self._block_size[0][1]),
                ]
        if creation_options is not None:
            options += creation_options

        try:
            self.raster.FlushCache()
            dst = gdal.GetDriverByName(driver_name).CreateCopy(
                str(path), self.raster, 0, options=options
            )
            # Re-point this Dataset object at the newly written on-disk copy.
            self._update_inplace(dst, "write")
            # flush the data to the dataset on disk.
            dst.FlushCache()
        except RuntimeError:
            # A RuntimeError with the file already present on disk is treated as
            # best-effort success (e.g. color-table related failures noted above).
            if not path.exists():
                raise FailedToSaveError(
                    f"Failed to save the {driver_name} raster to the path: {path}"
                )

get_tile(size=256) #

Get tile.

Parameters:

Name Type Description Default
size int

Size of the window in pixels. One value is required which is used for both the x and y size. e.g., 256 means a 256x256 window. Default is 256.

256

Yields:

Type Description
Generator[ndarray]

np.ndarray: Dataset array with a shape [band, y, x].

Examples:

  • First, we will create a dataset with 3 rows and 5 columns.

>>> import numpy as np
>>> arr = np.random.rand(3, 5)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
>>> print(dataset)
<BLANKLINE>
            Cell size: 0.05
            Dimension: 3 * 5
            EPSG: 4326
            Number of Bands: 1
            Band names: ['Band_1']
            Mask: -9999.0
            Data type: float64
            File:...
<BLANKLINE>

>>> print(dataset.read_array())   # doctest: +SKIP
[[0.55332314 0.48364841 0.67794589 0.6901816  0.70516817]
 [0.82518332 0.75657103 0.45693945 0.44331782 0.74677865]
 [0.22231314 0.96283065 0.15201337 0.03522544 0.44616888]]
- The get_tile method splits the domain into tiles of the specified size using the _tile_offsets function.

>>> tile_dimensions = list(dataset._tile_offsets(2))
>>> print(tile_dimensions)
[(0, 0, 2, 2), (2, 0, 2, 2), (4, 0, 1, 2), (0, 2, 2, 1), (2, 2, 2, 1), (4, 2, 1, 1)]
get_tile

  • So the first two chunks are 2*2, then a 2*1 chunk, two 1*2 chunks, and the last chunk is 1*1.
  • The get_tile method returns a generator object that can be used to iterate over the smaller chunks of the data.
>>> tiles_generator = dataset.get_tile(size=2)
>>> print(tiles_generator)  # doctest: +SKIP
<generator object Dataset.get_tile at 0x00000145AA39E680>
>>> print(list(tiles_generator))  # doctest: +SKIP
[
    array([[0.55332314, 0.48364841],
           [0.82518332, 0.75657103]]),
    array([[0.67794589, 0.6901816 ],
           [0.45693945, 0.44331782]]),
    array([[0.70516817], [0.74677865]]),
    array([[0.22231314, 0.96283065]]),
    array([[0.15201337, 0.03522544]]),
    array([[0.44616888]])
]
Source code in src/pyramids/dataset/ops/io.py
def get_tile(self: Dataset, size=256) -> Generator[np.ndarray]:
    """Iterate over the dataset as square tiles.

    Splits the raster domain into windows using `_tile_offsets` and lazily
    reads each window, so the whole raster never has to fit in memory.

    Args:
        size (int):
            Size of the window in pixels, applied to both the x and y
            dimensions (e.g. 256 yields 256x256 windows). Edge tiles may be
            smaller. Default is 256.

    Yields:
        np.ndarray:
            Array for each tile with shape `[band, y, x]`.

    Examples:
        - Split a dataset into 2x2 tiles and iterate over them:

          ```python
          >>> tiles_generator = dataset.get_tile(size=2)  # doctest: +SKIP
          >>> for tile in tiles_generator:  # doctest: +SKIP
          ...     print(tile.shape)
          ```
    """
    read_window = self.raster.ReadAsArray
    for x_off, y_off, win_xsize, win_ysize in self._tile_offsets(size=size):
        # Lazily read only the current window from the underlying raster.
        yield read_window(xoff=x_off, yoff=y_off, xsize=win_xsize, ysize=win_ysize)

map_blocks(func, tile_size=256, band=None) #

Apply a function block-by-block without loading the full raster into memory.

Reads the raster in tiles of tile_size x tile_size, applies func to each tile, and writes the result to a new in-memory Dataset. Neither the input nor the output array needs to fit in memory at once — only one tile at a time.

This is the key enabler for processing rasters larger than RAM.

Parameters:

Name Type Description Default
func Callable[[ndarray], ndarray]

A function that takes a numpy array (the tile) and returns a numpy array of the same shape. The function should handle no-data values internally if needed.

required
tile_size int

Size of each square tile in pixels. Default is 256.

256
band int | None

Band index to process. If None, all bands are processed. Default is None.

None

Returns:

Name Type Description
Dataset Dataset

A new Dataset with the function applied to every tile.

Examples:

  • Apply a function block-by-block to avoid loading a large raster into memory:
>>> import numpy as np
>>> arr = np.arange(1, 101, dtype=np.float32).reshape(10, 10)
>>> dataset = Dataset.create_from_array(
...     arr, top_left_corner=(0, 0), cell_size=1.0, epsg=4326
... )
>>> result = dataset.map_blocks(lambda tile: tile * 2, tile_size=5)
>>> print(result.read_array()[0, 0])
2.0
Source code in src/pyramids/dataset/ops/io.py
def map_blocks(
    self: Dataset,
    func: Callable[[np.ndarray], np.ndarray],
    tile_size: int = 256,
    band: int | None = None,
) -> Dataset:
    """Apply a function block-by-block without loading the full raster into memory.

    Reads the raster in tiles of `tile_size x tile_size`, applies `func` to each
    tile, and writes the result to a new in-memory Dataset. Neither the input nor
    the output array needs to fit in memory at once — only one tile at a time.

    This is the key enabler for processing rasters larger than RAM.

    Args:
        func (Callable[[np.ndarray], np.ndarray]):
            A function that takes a numpy array (the tile) and returns a numpy array
            of the same shape. The function should handle no-data values internally
            if needed.
        tile_size (int):
            Size of each square tile in pixels. Default is 256.
        band (int | None):
            Band index to process. If None, all bands are processed. Default is None.

    Returns:
        Dataset:
            A new Dataset with the function applied to every tile.

    Examples:
        - Apply a function block-by-block to avoid loading a large raster into memory:

          ```python
          >>> import numpy as np
          >>> arr = np.arange(1, 101, dtype=np.float32).reshape(10, 10)
          >>> dataset = Dataset.create_from_array(
          ...     arr, top_left_corner=(0, 0), cell_size=1.0, epsg=4326
          ... )
          >>> result = dataset.map_blocks(lambda tile: tile * 2, tile_size=5)
          >>> print(result.read_array()[0, 0])
          2.0

          ```
    """
    # Resolve the output layout once: a single band when `band` is given,
    # otherwise mirror the source band layout. (Previously this condition was
    # evaluated twice for no benefit.)
    if band is not None:
        bands = 1
        dtype = self.gdal_dtype[band]
        no_data = [self.no_data_value[band]]
    else:
        bands = self.band_count
        dtype = self.gdal_dtype[0]
        no_data = self.no_data_value

    dst_obj = type(self)._build_dataset(
        self.columns, self.rows, bands, dtype,
        self.geotransform, self.crs, no_data,
    )

    for xoff, yoff, xsize, ysize in self._tile_offsets(size=tile_size):
        if band is not None:
            # Single-band mode: read the requested band, write to band 1 of the output.
            tile = self._iloc(band).ReadAsArray(xoff, yoff, xsize, ysize)
            result_tile = func(np.asarray(tile))
            dst_obj.raster.GetRasterBand(1).WriteArray(result_tile, xoff, yoff)
        else:
            # All-band mode: process each band of the current window independently.
            for b in range(self.band_count):
                tile = self._raster.GetRasterBand(b + 1).ReadAsArray(
                    xoff, yoff, xsize, ysize
                )
                result_tile = func(np.asarray(tile))
                dst_obj.raster.GetRasterBand(b + 1).WriteArray(
                    result_tile, xoff, yoff
                )

    return dst_obj

to_xyz(bands=None, path=None) #

Convert to XYZ.

Parameters:

Name Type Description Default
path str

path to the file where the data will be saved. If None, the data will be returned as a DataFrame. default is None.

None
bands List[int]

indices of the bands. If None, all bands will be used. default is None

None

Returns:

Type Description
DataFrame | None

DataFrame/File: DataFrame with columns: lon, lat, band_1, band_2,... . If a path is provided the data will be saved to disk as a .xyz file

Examples:

  • First we will create a dataset from an integer array with two bands, each 2 rows by 2 columns.
    >>> import numpy as np
    >>> arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
    >>> top_left_corner = (0, 0)
    >>> cell_size = 0.05
    >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
    >>> print(dataset)
    <BLANKLINE>
                Top Left Corner: (0.0, 0.0)
                Cell size: 0.05
                Dimension: 2 * 2
                EPSG: 4326
                Number of Bands: 2
                Band names: ['Band_1', 'Band_2']
                Band colors: {0: 'undefined', 1: 'undefined'}
                Band units: ['', '']
                Scale: [1.0, 1.0]
                Offset: [0, 0]
                Mask: -9999.0
                Data type: int64
                File: ...
    <BLANKLINE>
    >>> df = dataset.to_xyz()
    >>> print(df)
         lon    lat  Band_1  Band_2
    0  0.025 -0.025       1       5
    1  0.075 -0.025       2       6
    2  0.025 -0.075       3       7
    3  0.075 -0.075       4       8
    
Source code in src/pyramids/dataset/ops/io.py
def to_xyz(
    self: Dataset, bands: list[int] | None = None, path: str | Path | None = None
) -> DataFrame | None:
    """Convert the dataset to XYZ (lon, lat, value) format.

    Args:
        bands (list[int] | int, optional):
            Zero-based indices of the bands to export. If None, all bands are used.
            Default is None.
        path (str | Path, optional):
            Path of the file where the data will be saved. If None, the data is
            returned as a DataFrame. Default is None.

    Returns:
        DataFrame | None:
            DataFrame with columns: lon, lat, band_1, band_2, ... when `path` is
            None. If a path is provided, the data is saved to disk as a .xyz file
            and None is returned.

    Raises:
        ValueError:
            If `bands` is neither None, an integer, nor a list of integers.

    Examples:
        - First we will create a dataset from an integer array with two bands:
            ```python
            >>> import numpy as np
            >>> arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
            >>> top_left_corner = (0, 0)
            >>> cell_size = 0.05
            >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size,epsg=4326)
            >>> df = dataset.to_xyz()
            >>> print(df)
                 lon    lat  Band_1  Band_2
            0  0.025 -0.025       1       5
            1  0.075 -0.025       2       6
            2  0.025 -0.075       3       7
            3  0.075 -0.075       4       8
            ```
    """
    # Normalize `bands` to a list of 1-based GDAL band numbers.
    if bands is None:
        band_nums = list(range(1, self.band_count + 1))
    elif isinstance(bands, int):
        band_nums = [bands + 1]
    elif isinstance(bands, list):
        band_nums = [band + 1 for band in bands]
    else:
        raise ValueError("bands must be an integer or a list of integers.")

    arr = gdal2xyz.gdal2xyz(
        self.raster,
        str(path) if path is not None else None,
        skip_nodata=True,
        return_np_arrays=True,
        band_nums=band_nums,
    )
    if path is not None:
        # The data was written to disk by gdal2xyz; nothing to return.
        return None

    # `band_nums` is always populated at this point (the previous version had an
    # unreachable `bands is None` fallback here), so map it straight to names.
    band_names = [self.band_names[num - 1] for num in band_nums]
    df = pd.DataFrame(columns=["lon", "lat"] + band_names)
    df["lon"] = arr[0]
    df["lat"] = arr[1]
    df[band_names] = arr[2].transpose()
    return df

create_overviews(resampling_method='nearest', overview_levels=None) #

Create overviews for the dataset. Args: resampling_method (str): The resampling method used to create the overviews. Possible values are "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest". overview_levels (list, optional): The overview levels. Restricted to typical power-of-two reduction factors. Defaults to [2, 4, 8, 16, 32]. Returns: None: Creates internal or external overviews depending on the dataset access mode. See Notes. Notes: - External (.ovr file): If the dataset is read with read_only=True then the overviews file will be created as an external .ovr file in the same directory of the dataset. - Internal: If the dataset is read with read_only=False then the overviews will be created internally in the dataset, and the dataset needs to be saved/flushed to persist the changes to disk. - You can check the count per band via the overview_count property. Examples: - Create a Dataset with 4 bands, 10 rows, 10 columns, at the point lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.rand(4, 10, 10)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
- Now, create overviews using the default parameters:
>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]
- For each band, there are 4 overview levels you can use to plot the bands:
>>> dataset.plot(band=0, overview=True, overview_index=0) # doctest: +SKIP
overviews-level-0 - However, the dataset originally is 10*10, but the first overview level (2) displays half of the cells by aggregating all the cells using the nearest neighbor. The second level displays only 3 cells in each:
>>> dataset.plot(band=0, overview=True, overview_index=1)   # doctest: +SKIP
overviews-level-1 - For the third overview level:
>>> dataset.plot(band=0, overview=True, overview_index=2)       # doctest: +SKIP
overviews-level-2 See Also: - Dataset.recreate_overviews: Recreate the dataset overviews if they exist - Dataset.get_overview: Get an overview of a band - Dataset.overview_count: Number of overviews - Dataset.read_overview_array: Read overview values - Dataset.plot: Plot a band

Source code in src/pyramids/dataset/ops/io.py
def create_overviews(
    self: Dataset,
    resampling_method: str = "nearest",
    overview_levels: list | None = None,
) -> None:
    """Create overviews for the dataset.

    Args:
        resampling_method (str):
            The resampling method used to create the overviews. Possible values are
            "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE",
            "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest".
        overview_levels (list, optional):
            The overview levels (reduction factors), restricted to the typical
            power-of-two values. Defaults to [2, 4, 8, 16, 32].

    Returns:
        None:
            Creates internal or external overviews depending on the dataset access mode.

    Raises:
        TypeError: if `overview_levels` is not a list.
        ValueError: if a level is not a supported power-of-two reduction factor, or
            if `resampling_method` is not one of the supported methods.

    Notes:
        - External (.ovr file): if the dataset is read with `read_only=True` the
          overviews are created as an external .ovr file next to the dataset.
        - Internal: if the dataset is read with `read_only=False` the overviews are
          created inside the dataset; it must be saved/flushed to persist them.
        - Check the count per band via the `overview_count` property.

    See Also:
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist
        - Dataset.get_overview: Get an overview of a band
        - Dataset.overview_count: Number of overviews
        - Dataset.read_overview_array: Read overview values
        - Dataset.plot: Plot a band
    """
    # Validate the levels first, falling back to the library defaults.
    if overview_levels is None:
        overview_levels = OVERVIEW_LEVELS
    elif not isinstance(overview_levels, list):
        raise TypeError("overview_levels should be a list")
    elif any(level not in OVERVIEW_LEVELS for level in overview_levels):
        raise ValueError(
            "overview_levels are restricted to the typical power-of-two reduction factors "
            "(like 2, 4, 8, 16, etc.)"
        )

    if resampling_method.upper() not in RESAMPLING_METHODS:
        raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")

    # Each level is a reduction factor, e.g. 2 means half the original resolution.
    self.raster.BuildOverviews(resampling_method, overview_levels)

recreate_overviews(resampling_method='nearest') #

Recreate overviews for the dataset. Args: resampling_method (str): Resampling method used to recreate overviews. Possible values are "NEAREST", "CUBIC", "AVERAGE", "GAUSS", "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE", "RMS", "BILINEAR". Defaults to "nearest". Raises: ValueError: If resampling_method is not one of the allowed values above. ReadOnlyError: If overviews are internal and the dataset is opened read-only. Read with read_only=False. See Also: - Dataset.create_overviews: Recreate the dataset overviews if they exist. - Dataset.get_overview: Get an overview of a band. - Dataset.overview_count: Number of overviews. - Dataset.read_overview_array: Read overview values. - Dataset.plot: Plot a band.

Source code in src/pyramids/dataset/ops/io.py
def recreate_overviews(self: Dataset, resampling_method: str = "nearest") -> None:
    """Recreate overviews for the dataset.

    Regenerates every existing overview of every band from the full-resolution data.

    Args:
        resampling_method (str): Resampling method used to recreate overviews.
            Possible values are "NEAREST", "CUBIC", "AVERAGE", "GAUSS",
            "CUBICSPLINE", "LANCZOS", "MODE", "AVERAGE_MAGPHASE", "RMS",
            "BILINEAR". Defaults to "nearest".

    Raises:
        ValueError:
            If resampling_method is not one of the allowed values above.
        ReadOnlyError:
            If overviews are internal and the dataset is opened read-only.
            Read with read_only=False.

    See Also:
        - Dataset.create_overviews: Recreate the dataset overviews if they exist.
        - Dataset.get_overview: Get an overview of a band.
        - Dataset.overview_count: Number of overviews.
        - Dataset.read_overview_array: Read overview values.
        - Dataset.plot: Plot a band.
    """
    if resampling_method.upper() not in RESAMPLING_METHODS:
        raise ValueError(f"resampling_method should be one of {RESAMPLING_METHODS}")

    try:
        for band_i in range(self.band_count):
            source_band = self._iloc(band_i)
            # Regenerate every overview level of this band from the full-resolution band.
            for ovr_i in range(self.overview_count[band_i]):
                overview = self.get_overview(band_i, ovr_i)
                # TODO: gdal.RegenerateOverviews() (plural) would be faster for many
                #  levels at once, but it does not expose a choice of resampling
                #  method; it takes the overview bands as a list:
                #  overviews = [band.GetOverview(i) for i in range(band.GetOverviewCount())]
                gdal.RegenerateOverview(source_band, overview, resampling_method)
    except RuntimeError:
        raise ReadOnlyError(
            "The Dataset is opened with a read only. Please read the dataset using read_only=False"
        )

get_overview(band=0, overview_index=0) #

Get an overview of a band. Args: band (int): The band index. Defaults to 0. overview_index (int): Index of the overview. Defaults to 0. Returns: gdal.Band: GDAL band object. Examples: - Create Dataset consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 10, size=(4, 10, 10))
>>> print(arr[0, :, :]) # doctest: +SKIP
array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
       [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
       [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
       [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
       [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
       [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
       [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
       [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
       [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
       [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
- Now, create overviews using the default parameters and inspect them:
>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]
>>> ovr = dataset.get_overview(band=0, overview_index=0)
>>> print(ovr)  # doctest: +SKIP
<osgeo.gdal.Band; proxy of <Swig Object of type 'GDALRasterBandShadow *' at 0x0000017E2B5AF1B0> >
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 3, 4, 4, 8],
       [5, 9, 7, 5, 6],
       [5, 3, 1, 7, 5],
       [1, 4, 2, 6, 9],
       [4, 3, 6, 5, 8]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=1)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 7, 3],
       [2, 5, 6],
       [6, 9, 9]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=2)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6, 8],
       [8, 5]], dtype=int32)
>>> ovr = dataset.get_overview(band=0, overview_index=3)
>>> ovr.ReadAsArray()  # doctest: +SKIP
array([[6]], dtype=int32)
See Also: - Dataset.create_overviews: Create the dataset overviews if they exist. - Dataset.create_overviews: Recreate the dataset overviews if they exist. - Dataset.overview_count: Number of overviews. - Dataset.read_overview_array: Read overview values. - Dataset.plot: Plot a band.

Source code in src/pyramids/dataset/ops/io.py
def get_overview(self: Dataset, band: int = 0, overview_index: int = 0) -> gdal.Band:
    """Get an overview of a band.

    Args:
        band (int):
            The band index. Defaults to 0.
        overview_index (int):
            Index of the overview. Defaults to 0.

    Returns:
        gdal.Band:
            GDAL band object for the requested overview.

    Raises:
        ValueError:
            If the band has no overviews, or if `overview_index` is not smaller
            than the number of available overviews.

    Examples:
        - Create a `Dataset`, build overviews, and fetch one:
          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=(0, 0), cell_size=0.05, epsg=4326)
          >>> dataset.create_overviews()
          >>> ovr = dataset.get_overview(band=0, overview_index=0)
          >>> print(ovr)  # doctest: +SKIP
          <osgeo.gdal.Band; proxy of <Swig Object of type 'GDALRasterBandShadow *' at 0x0000017E2B5AF1B0> >
          >>> ovr.ReadAsArray().shape  # doctest: +SKIP
          (5, 5)
          ```

    See Also:
        - Dataset.create_overviews: Create the dataset overviews if they exist.
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
        - Dataset.overview_count: Number of overviews.
        - Dataset.read_overview_array: Read overview values.
        - Dataset.plot: Plot a band.
    """
    band_obj = self._iloc(band)
    n_views = band_obj.GetOverviewCount()
    if n_views == 0:
        raise ValueError(
            "The band has no overviews, please use the `create_overviews` method to build the overviews"
        )
    if overview_index >= n_views:
        # Fixed: the message previously referred to "overview_level", which is not
        # the name of any parameter of this method.
        raise ValueError(f"overview_index should be less than {n_views}")
    # TODO: find a way to create a Dataset object from the overview band and return
    #  the Dataset object instead of the raw gdal band.
    return band_obj.GetOverview(overview_index)

read_overview_array(band=None, overview_index=0) #

Read overview values. - Read the values stored in a given band or overview. Args: band (int | None): The band to read. If None and multiple bands exist, reads all bands at the given overview. overview_index (int): Index of the overview. Defaults to 0. Returns: np.ndarray: Array with the values in the raster. Examples: - Create Dataset consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):

>>> import numpy as np
>>> arr = np.random.randint(1, 10, size=(4, 10, 10))
>>> print(arr[0, :, :])     # doctest: +SKIP
array([[6, 3, 3, 7, 4, 8, 4, 3, 8, 7],
       [6, 7, 3, 7, 8, 6, 3, 4, 3, 8],
       [5, 8, 9, 6, 7, 7, 5, 4, 6, 4],
       [2, 9, 9, 5, 8, 4, 9, 6, 8, 7],
       [5, 8, 3, 9, 1, 5, 7, 9, 5, 9],
       [8, 3, 7, 2, 2, 5, 2, 8, 7, 7],
       [1, 1, 4, 2, 2, 2, 6, 5, 9, 2],
       [6, 3, 2, 9, 8, 8, 1, 9, 7, 7],
       [4, 1, 3, 1, 6, 7, 5, 4, 8, 7],
       [9, 7, 2, 1, 4, 6, 1, 2, 3, 3]], dtype=int32)
>>> top_left_corner = (0, 0)
>>> cell_size = 0.05
>>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
- Create overviews using the default parameters and read overview arrays:
>>> dataset.create_overviews()
>>> print(dataset.overview_count)  # doctest: +SKIP
[4, 4, 4, 4]
>>> arr = dataset.read_overview_array(band=0, overview_index=0)
>>> print(arr)  # doctest: +SKIP
array([[6, 3, 4, 4, 8],
       [5, 9, 7, 5, 6],
       [5, 3, 1, 7, 5],
       [1, 4, 2, 6, 9],
       [4, 3, 6, 5, 8]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=1)
>>> print(arr)  # doctest: +SKIP
array([[6, 7, 3],
       [2, 5, 6],
       [6, 9, 9]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=2)
>>> print(arr)  # doctest: +SKIP
array([[6, 8],
       [8, 5]], dtype=int32)
>>> arr = dataset.read_overview_array(band=0, overview_index=3)
>>> print(arr)  # doctest: +SKIP
array([[6]], dtype=int32)
See Also: - Dataset.create_overviews: Create the dataset overviews. - Dataset.create_overviews: Recreate the dataset overviews if they exist. - Dataset.get_overview: Get an overview of a band. - Dataset.overview_count: Number of overviews. - Dataset.plot: Plot a band.

Source code in src/pyramids/dataset/ops/io.py
def read_overview_array(
    self: Dataset, band: int | None = None, overview_index: int = 0
) -> np.ndarray:
    """Read overview values.

        - Read the values stored in a given band or overview.

    Args:
        band (int | None):
            The band to read. If None and multiple bands exist, reads all bands at the given overview.
        overview_index (int):
            Index of the overview. Defaults to 0.

    Returns:
        np.ndarray:
            Array with the values in the raster. Shape is (rows, cols) for a
            single band, or (band_count, rows, cols) when all bands are read.

    Raises:
        ValueError:
            If the band index is out of range, or if the requested band(s)
            have no overviews.

    Examples:
        - Create `Dataset` consisting of 4 bands, 10 rows, 10 columns, at lon/lat (0, 0):
          ```python
          >>> import numpy as np
          >>> arr = np.random.randint(1, 10, size=(4, 10, 10))
          >>> top_left_corner = (0, 0)
          >>> cell_size = 0.05
          >>> dataset = Dataset.create_from_array(arr, top_left_corner=top_left_corner, cell_size=cell_size, epsg=4326)
          ```
        - Create overviews using the default parameters and read overview arrays:
          ```python
          >>> dataset.create_overviews()
          >>> print(dataset.overview_count)  # doctest: +SKIP
          [4, 4, 4, 4]
          >>> arr = dataset.read_overview_array(band=0, overview_index=0)
          >>> print(arr)  # doctest: +SKIP
          array([[6, 3, 4, 4, 8],
                 [5, 9, 7, 5, 6],
                 [5, 3, 1, 7, 5],
                 [1, 4, 2, 6, 9],
                 [4, 3, 6, 5, 8]], dtype=int32)
          >>> arr = dataset.read_overview_array(band=0, overview_index=3)
          >>> print(arr)  # doctest: +SKIP
          array([[6]], dtype=int32)
          ```

    See Also:
        - Dataset.create_overviews: Create the dataset overviews.
        - Dataset.recreate_overviews: Recreate the dataset overviews if they exist.
        - Dataset.get_overview: Get an overview of a band.
        - Dataset.overview_count: Number of overviews.
        - Dataset.plot: Plot a band.
    """
    if band is None and self.band_count > 1:
        if any(count == 0 for count in self.overview_count):
            raise ValueError(
                "Some bands do not have overviews, please create overviews first"
            )
        # Size the output from the *requested* overview level, not level 0:
        # higher levels are smaller, and sizing from level 0 would make the
        # per-band assignments below fail for any overview_index > 0.
        first_band = np.asarray(
            self.get_overview(0, overview_index).ReadAsArray()
        )
        arr: np.ndarray = np.empty(
            (self.band_count, first_band.shape[0], first_band.shape[1]),
            dtype=self.numpy_dtype[0],
        )
        arr[0, :, :] = first_band
        for i in range(1, self.band_count):
            arr[i, :, :] = self.get_overview(i, overview_index).ReadAsArray()
    else:
        if band is None:
            # Single-band dataset: default to the only band.
            band = 0
        elif not 0 <= band <= self.band_count - 1:
            # Reject negative indices too; a negative band would silently
            # index overview_count from the end.
            raise ValueError(
                f"band index should be between 0 and {self.band_count - 1}"
            )
        # Checked for the defaulted band as well, so a single-band dataset
        # without overviews gets a clear error instead of a GDAL failure.
        if self.overview_count[band] == 0:
            raise ValueError(
                f"band {band} has no overviews, please create overviews first"
            )
        arr = np.asarray(self.get_overview(band, overview_index).ReadAsArray())
    return arr