Source code for iblutil.io.binary

from io import IOBase
from os import PathLike
from pathlib import Path
from typing import BinaryIO

import numpy as np
import numpy.typing as npt
import pandas as pd


[docs] def load_as_dataframe( filepath_bin: PathLike | str, dtype: np.dtype, count: int = -1, offset: int = 0, ) -> pd.DataFrame: """ Load a binary file into a pandas DataFrame using a specified NumPy structured data type. Parameters ---------- filepath_bin : Path or str The path to the binary file to be loaded. Can be a string or a Path object. dtype : np.dtype A NumPy structured data type that defines the format of the data in the binary file. Must be a structured datatype with fields. count : int, optional The number of items to read from the binary file. Default is -1, which means all items. offset : int, optional The number of bytes to skip at the beginning of the file before reading data. Default is 0. Returns ------- pd.DataFrame A pandas DataFrame containing the data read from the binary file. Raises ------ FileNotFoundError If the specified binary file does not exist. IsADirectoryError If the specified path is a directory instead of a file. ValueError If the provided dtype is not a NumPy structured datatype. """ filepath_bin = Path(filepath_bin) if not filepath_bin.exists(): raise FileNotFoundError(filepath_bin) if filepath_bin.is_dir(): raise IsADirectoryError(filepath_bin) if not isinstance(dtype, np.dtype) or not hasattr(dtype, 'fields') or dtype.fields is None: raise ValueError('dtype must be a NumPy structured datatype') structured_array = np.fromfile(file=filepath_bin, dtype=dtype, count=count, offset=offset) return pd.DataFrame(structured_array)
def convert_to_parquet(
    filepath_bin: PathLike | str,
    dtype: np.dtype,
    delete_bin_file: bool = False,
) -> Path:
    """
    Convert a binary file of structured records into a Parquet file.

    Parameters
    ----------
    filepath_bin : Path or str
        Location of the binary file to convert.
    dtype : np.dtype
        Structured NumPy datatype describing one record of the binary file.
        It must be a structured datatype with fields.
    delete_bin_file : bool, optional
        When True, the binary file is removed once the Parquet file has been
        written. Default is False.

    Returns
    -------
    Path
        Path of the Parquet file that was written: the input path with its
        suffix replaced by '.pqt'.

    Raises
    ------
    FileNotFoundError
        If the binary file does not exist.
    FileExistsError
        If the output file already exists.
    IsADirectoryError
        If the given path is a directory.
    ValueError
        If `dtype` is not a structured NumPy datatype.
    """
    source = Path(filepath_bin)

    # The input is loaded (and thereby validated) before the output path is
    # derived and checked, so input errors are reported first.
    frame = load_as_dataframe(filepath_bin=source, dtype=dtype)

    target = source.with_suffix('.pqt')
    if target.exists():
        raise FileExistsError(target)
    frame.to_parquet(target)

    if delete_bin_file:
        source.unlink()
    return target
[docs] def write_array(fid: BinaryIO | str | PathLike, array: npt.ArrayLike, dtype: np.dtype): """ Write a structured NumPy array to a binary file. Parameters ---------- fid : bytes, str, IO The file path or file-like object where the structured array will be written. array : npt.ArrayLike The input array to be written. It must have a maximum of two dimensions, and the last dimension must match the number of fields in the provided dtype. dtype : np.dtype A structured NumPy datatype that defines the fields of the array. It must be a valid structured dtype with fields. Raises ------ ValueError If `dtype` is not a structured NumPy datatype. If the input `array` has more than two dimensions. If the last dimension of `array` does not match the number of fields in `dtype`. FileExistsError If `fid` represents a Path and the respective file already exists. TypeError If `fid` is not a stream and cannot be converted to a Path. """ if not isinstance(dtype, np.dtype) or not hasattr(dtype, 'fields') or dtype.fields is None: raise ValueError("'dtype' must be a structured NumPy datatype") array = np.array(array) if array.ndim > 2: raise ValueError('The array must have a maximum of two dimensions.') if array.shape[-1] != len(dtype.fields): raise ValueError("The array's last dimension must match the number of fields in 'dtype'.") if not isinstance(fid, IOBase) and Path(fid).exists(): # type: ignore raise FileExistsError(fid) structured_data = np.rec.fromrecords(array).astype(dtype) structured_data.tofile(fid)