from pathlib import Path
from typing import Union, Optional, Dict, List, Tuple, Callable
from typing_extensions import Self
import pandas as pd
import numpy as np
from copy import deepcopy
from ocpy.model_data import DataModel
from ocpy.custom_types import BinarySeq
from ocpy.utils import Fixer
from .errors import LengthCheckError
from .oc import OC
from .oc_lmfit import OCLMFit
from .oc_pymc import OCPyMC
[docs]
class Data(DataModel):
    """
    Container for eclipse minimum timing data.

    The `Data` class stores and manages observational times of minima
    (e.g., eclipse timings of binary stars) together with optional
    metadata such as timing uncertainties, observational weights,
    minimum type (primary/secondary), and labels.

    Internally the data are stored in a :class:`pandas.DataFrame`
    (the ``data`` attribute) with standardized column names. The class
    provides utilities for manipulating the dataset, including:

    * Filling or computing timing uncertainties and weights
    * Loading data from files
    * Computing O–C (Observed minus Calculated) values
    * Grouping or merging datasets

    The class is designed to behave like a lightweight table object
    while preserving domain-specific semantics required for
    O–C analysis of eclipsing binaries.

    Parameters
    ----------
    minimum_time : list-like
        Times of observed minima (typically in Julian Date or BJD).
        This field is required.
    minimum_time_error : list-like, optional
        Uncertainties associated with each minimum time.
    weights : list-like, optional
        Weights assigned to each observation.
    minimum_type : BinarySeq, optional
        Indicator of minimum type (e.g., primary or secondary eclipse).
        Accepted representations may include integers, strings
        (e.g., ``"I"``, ``"II"``, ``"primary"``, ``"secondary"``), or
        other binary encodings.
    labels : list-like, optional
        Optional labels or identifiers for each observation.

    Raises
    ------
    ValueError
        If ``minimum_time`` is ``None``.
    LengthCheckError
        Presumably raised by the length normalization in
        :mod:`ocpy.utils` when a provided sequence cannot be matched
        to the length of ``minimum_time``. NOTE(review): the
        constructor itself does not raise it directly; confirm
        against ``Fixer.length_fixer``.

    Notes
    -----
    All optional inputs are passed through
    ``Fixer.length_fixer`` from :mod:`ocpy.utils` together with
    ``minimum_time`` before the table is built.

    The underlying storage is a :class:`pandas.DataFrame` with
    the following standard columns:

    - ``minimum_time``
    - ``minimum_time_error``
    - ``weights``
    - ``minimum_type``
    - ``labels``

    Most modification methods return a **new `Data` instance**
    rather than mutating the existing object; item assignment
    (``d[key] = value``) is the exception and edits the table in place.

    Examples
    --------
    Create a dataset with minimum times:

    >>> from ocpy import Data
    >>> d = Data(minimum_time=[2450000.1, 2450001.3, 2450002.5])

    With uncertainties and types:

    >>> d = Data(
    ...     minimum_time=[2450000.1, 2450001.3],
    ...     minimum_time_error=[0.0002, 0.0003],
    ...     minimum_type=["I", "II"]
    ... )

    Access the underlying table:

    >>> d.data
    """
def __init__(
    self,
    minimum_time: List,
    minimum_time_error: Optional[List] = None,
    weights: Optional[List] = None,
    minimum_type: Optional[BinarySeq] = None,
    labels: Optional[List] = None
) -> None:
    """
    Build the internal table from minimum times and optional metadata.

    Every optional argument is passed through
    :func:`ocpy.utils.Fixer.length_fixer` together with
    ``minimum_time`` and the results are stored, alongside the
    minimum times, in a :class:`pandas.DataFrame` held on
    ``self.data``.

    Parameters
    ----------
    minimum_time : list-like
        Times of observed minima (typically JD or BJD). Required;
        it is the reference handed to the length fixer for every
        other column.
    minimum_time_error : list-like or float, optional
        Uncertainty of each minimum time.
    weights : list-like or float, optional
        Weight of each observation.
    minimum_type : BinarySeq, optional
        Primary/secondary indicator for each minimum (e.g. ``"I"``,
        ``"II"``, ``"primary"``, ``"secondary"`` or numeric codes).
    labels : list-like, optional
        Free-form identifiers such as instrument, observer or source.

    Raises
    ------
    ValueError
        If ``minimum_time`` is ``None``.

    Notes
    -----
    The resulting table always has the columns ``minimum_time``,
    ``minimum_time_error``, ``weights``, ``minimum_type`` and
    ``labels``, in that order. How scalars and ``None`` are expanded
    is decided by ``Fixer.length_fixer``.

    Examples
    --------
    >>> Data(minimum_time=[2450000.1, 2450001.2, 2450002.4])
    >>> Data(
    ...     minimum_time=[2450000.1, 2450001.2],
    ...     minimum_time_error=[0.0002, 0.0003],
    ...     minimum_type=["I", "II"]
    ... )
    >>> Data(
    ...     minimum_time=[2450000.1, 2450001.2],
    ...     minimum_time_error=0.0002
    ... )
    """
    if minimum_time is None:
        raise ValueError("`minimum_time` is required and cannot be None.")

    optional_inputs = {
        "minimum_time_error": minimum_time_error,
        "weights": weights,
        "minimum_type": minimum_type,
        "labels": labels,
    }
    # Each optional column is normalised against the caller's original
    # ``minimum_time`` object.
    normalised = {
        name: Fixer.length_fixer(raw, minimum_time)
        for name, raw in optional_inputs.items()
    }

    # pandas refuses to build a frame from bare scalars, so an object
    # without a length is wrapped in a one-element list.
    if hasattr(minimum_time, "__len__"):
        times = minimum_time
    else:
        times = [minimum_time]

    self.data = pd.DataFrame({"minimum_time": times, **normalised})
def __str__(self) -> str:
return self.data.__str__()
def __getitem__(self, item) -> Union[Self, pd.Series]:
"""
Retrieve data from the dataset.
This method provides flexible indexing behavior similar to a
:class:`pandas.DataFrame`. Depending on the type of ``item``,
it can return either a column, a single-row `Data` object, or
a filtered `Data` object.
Parameters
----------
item : str, int, slice, array-like, or pandas-compatible indexer
Index or key used to access the data.
* ``str`` – Returns the corresponding column as a
:class:`pandas.Series`.
* ``int`` – Returns a new `Data` object containing only
the selected row.
* Other indexers (e.g., slices, boolean masks, lists) –
Returns a new `Data` object containing the filtered rows.
Returns
-------
Data or pandas.Series
- If ``item`` is a string, the corresponding column is returned
as a :class:`pandas.Series`.
- Otherwise, a new `Data` object containing the selected rows
is returned.
Notes
-----
This method intentionally mimics the behavior of
:class:`pandas.DataFrame` indexing while preserving the `Data`
abstraction.
When row selection is performed, the result is wrapped into a
new `Data` instance to ensure that all domain-specific methods
(e.g., O–C calculations) remain available.
Examples
--------
Select a column:
>>> d["minimum_time"]
Select a single observation:
>>> d[0]
Filter rows using a boolean mask:
>>> mask = d["weights"] > 1
>>> d_filtered = d[mask]
Slice the dataset:
>>> d_subset = d[:5]
"""
if isinstance(item, str):
return self.data[item]
elif isinstance(item, int):
row = self.data.iloc[item]
return Data(
minimum_time=[row["minimum_time"]],
minimum_time_error=[row["minimum_time_error"]],
weights=[row["weights"]],
minimum_type=[row["minimum_type"]],
labels=[row["labels"]],
)
else:
filtered_table = self.data[item]
return Data(
minimum_time=filtered_table["minimum_time"],
minimum_time_error=filtered_table["minimum_time_error"],
weights=filtered_table["weights"],
minimum_type=filtered_table["minimum_type"],
labels=filtered_table["labels"],
)
def __setitem__(self, key, value) -> None:
"""
Assign values to a column in the dataset.
This method allows column assignment using dictionary-like
syntax, similar to :class:`pandas.DataFrame`. The specified
column will be created if it does not already exist, or
overwritten if it does.
Parameters
----------
key : str
Name of the column to assign or modify.
value : scalar or array-like
Values to assign to the column. Scalars will be broadcast
to all rows, while array-like objects must have a length
compatible with the dataset.
Returns
-------
None
Notes
-----
This operation modifies the underlying :class:`pandas.DataFrame`
in place.
Unlike most other methods of `Data`, which return a new instance,
``__setitem__`` directly mutates the existing object.
Examples
--------
Add a new column:
>>> d["observer"] = ["A", "B", "C"]
Assign a scalar value to all rows:
>>> d["instrument"] = "TESS"
Modify an existing column:
>>> d["weights"] = [1.0, 0.5, 2.0]
"""
self.data.loc[:, key] = value
def __len__(self) -> int:
return len(self.data)
[docs]
@classmethod
def from_file(cls, file: Union[str, Path], columns: Optional[Dict[str, str]] = None) -> Self:
"""
Create a `Data` object from a tabular file.
This method reads timing data from a CSV or Excel file and converts
it into a `Data` instance. Column names in the file can optionally be
mapped to the standardized column names used by the `Data` class.
Parameters
----------
file : str or pathlib.Path
Path to the input file. Supported formats are:
- ``.csv``
- ``.xls``
- ``.xlsx``
columns : dict of str to str, optional
Mapping between the standardized `Data` column names and the
column names present in the file.
Two mapping styles are accepted:
1. **Standard → file column name**
Example:
>>> columns = {"minimum_time": "BJD", "minimum_time_error": "err"}
2. **File column name → standard**
Example:
>>> columns = {"BJD": "minimum_time", "err": "minimum_time_error"}
The standard column names recognized by the `Data` class are:
- ``minimum_time``
- ``minimum_time_error``
- ``weights``
- ``minimum_type``
- ``labels``
Returns
-------
Data
A new `Data` instance containing the imported observations.
Raises
------
ValueError
If the file format is not supported.
ValueError
If the file does not contain a column corresponding to
``minimum_time``.
Notes
-----
The ``minimum_time`` column is required to construct a valid
dataset. All other columns are optional and will be set to
``None`` if not present in the file.
Internally, the file is loaded using :func:`pandas.read_csv`
or :func:`pandas.read_excel`.
Examples
--------
Load a dataset from a CSV file:
>>> d = Data.from_file("minima.csv")
Load a file with custom column names:
>>> d = Data.from_file(
... "observations.csv",
... columns={"BJD": "minimum_time", "err": "minimum_time_error"}
... )
Load an Excel file:
>>> d = Data.from_file("minima.xlsx")
"""
file_path = Path(file)
if file_path.suffix.lower() == ".csv":
df = pd.read_csv(file_path)
elif file_path.suffix.lower() in (".xls", ".xlsx"):
df = pd.read_excel(file_path)
else:
raise ValueError("Unsupported file type. Use `csv`, `xls`, or `xlsx` instead")
expected = ["minimum_time", "minimum_time_error", "weights", "minimum_type", "labels"]
if columns:
if any(k in expected for k in columns.keys()):
rename_map = {v: k for k, v in columns.items()}
else:
rename_map = columns
df = df.rename(columns=rename_map)
if "minimum_time" not in df.columns:
available = list(df.columns)
raise ValueError(f"Could not find 'minimum_time' in file columns. Available columns: {available}. "
f"Please check your 'columns' mapping.")
kwargs = {c: (df[c] if c in df.columns else None) for c in expected}
return cls(**kwargs)
def _assign_or_fill(self, df: pd.DataFrame, col: str, values, override: bool) -> None:
"""
Assign values to a DataFrame column or fill missing entries.
If ``override`` is ``True`` or the column does not exist in the
DataFrame, the column is assigned directly with the provided
values. Otherwise, only the entries that are ``NaN`` are replaced,
leaving existing non-null values unchanged.
Parameters
----------
df : pandas.DataFrame
Target DataFrame in which the column will be modified.
col : str
Name of the column to assign or update.
values : scalar or array-like
Values to assign to the column. Scalars may be broadcast
by pandas, while array-like values must be compatible with
the DataFrame length.
override : bool
If ``True``, the column is completely replaced with the
provided values. If ``False``, only missing values
(``NaN``) in the existing column are filled.
Returns
-------
None
Notes
-----
This is an internal utility method used by functions such as
:meth:`fill_errors`, :meth:`fill_weights`, and
:meth:`calculate_weights` to ensure consistent column updates.
The operation modifies the provided DataFrame in place.
"""
if override or col not in df.columns:
df[col] = values
else:
base = df[col]
df[col] = base.where(~pd.isna(base), values)
[docs]
def fill_errors(self, errors: Union[List, Tuple, np.ndarray, float], override: bool = False) -> Self:
"""
Fill or assign timing uncertainties for the dataset.
This method returns a new `Data` object in which the
``minimum_time_error`` column is populated using the provided
values. Existing values can optionally be preserved or replaced.
Parameters
----------
errors : list, tuple, numpy.ndarray, or float
Uncertainty values to assign to ``minimum_time_error``.
- If a scalar is provided, it will be applied to all rows.
- If an array-like object is provided, its length must match
the number of observations in the dataset.
override : bool, default=False
If ``True``, all existing values in the ``minimum_time_error``
column are replaced.
If ``False``, only entries that are currently ``NaN`` are filled,
leaving existing values unchanged.
Returns
-------
Data
A new `Data` instance with updated ``minimum_time_error`` values.
Raises
------
LengthCheckError
If ``errors`` is array-like and its length does not match the
number of rows in the dataset.
Notes
-----
This method does not modify the original object. Instead,
it returns a new `Data` instance with the updated values.
Internally, column assignment is handled by the private
:meth:`_assign_or_fill` method to ensure consistent behavior
across similar operations.
Examples
--------
Assign a constant uncertainty to all observations:
>>> d2 = d.fill_errors(0.0002)
Fill only missing uncertainties:
>>> d2 = d.fill_errors([0.0002, 0.0003, 0.00025])
Replace all existing uncertainties:
>>> d2 = d.fill_errors(0.0002, override=True)
"""
new_data = deepcopy(self)
if isinstance(errors, (list, tuple, np.ndarray)) and len(errors) != len(new_data.data):
raise LengthCheckError("Length of `errors` must be equal to the length of the data")
self._assign_or_fill(new_data.data, "minimum_time_error", errors, override)
return new_data
[docs]
def fill_weights(self, weights: Union[List, Tuple, np.ndarray, float], override: bool = False) -> Self:
"""
Fill or assign observational weights for the dataset.
This method returns a new `Data` object in which the
``weights`` column is populated using the provided values.
Existing weights can optionally be preserved or replaced.
Parameters
----------
weights : list, tuple, numpy.ndarray, or float
Weight values to assign to the ``weights`` column.
- If a scalar is provided, it will be applied to all rows.
- If an array-like object is provided, its length must match
the number of observations in the dataset.
override : bool, default=False
If ``True``, all existing values in the ``weights`` column
are replaced.
If ``False``, only entries that are currently ``NaN`` are filled,
leaving existing values unchanged.
Returns
-------
Data
A new `Data` instance with updated ``weights`` values.
Raises
------
LengthCheckError
If ``weights`` is array-like and its length does not match the
number of rows in the dataset.
Notes
-----
This method does not modify the original object. Instead,
it returns a new `Data` instance with the updated values.
Internally, column assignment is handled by the private
:meth:`_assign_or_fill` method to ensure consistent behavior
across similar operations.
Examples
--------
Assign a constant weight to all observations:
>>> d2 = d.fill_weights(1.0)
Fill only missing weights:
>>> d2 = d.fill_weights([1.0, 0.5, 2.0])
Replace all existing weights:
>>> d2 = d.fill_weights(1.0, override=True)
"""
new_data = deepcopy(self)
if isinstance(weights, (list, tuple, np.ndarray)) and len(weights) != len(new_data.data):
raise LengthCheckError("Length of `weights` must be equal to the length of the data")
self._assign_or_fill(new_data.data, "weights", weights, override)
return new_data
[docs]
def calculate_weights(self, method: Callable[[pd.Series], pd.Series] = None, override: bool = True) -> Self:
r"""
Calculate observational weights based on timing uncertainties.
This method computes weights for each observation, typically
using the inverse-variance method, and returns a new `Data`
instance with the updated ``weights`` column.
Parameters
----------
method : callable, optional
A custom function to compute weights from the
``minimum_time_error`` series. It must accept a
:class:`pandas.Series` of errors and return a
:class:`pandas.Series` of weights.
If ``None`` (default), the inverse-variance method is used:
.. math::
w_i = \frac{1}{\sigma_i^2}
where :math:`\sigma_i` is the timing uncertainty
for the i-th observation.
override : bool, default=True
If ``True``, existing ``weights`` values are replaced.
If ``False``, only missing entries (``NaN``) are filled.
Returns
-------
Data
A new `Data` instance with updated weights.
Raises
------
ValueError
If ``minimum_time_error`` contains ``NaN`` values.
ValueError
If ``minimum_time_error`` contains zero values, which
would cause division by zero in the default method.
TypeError
If ``method`` is provided but is not callable.
Notes
-----
- The default inverse-variance weighting gives higher weight
to observations with smaller uncertainties.
- This method does not modify the original `Data` instance;
it returns a new instance with updated weights.
- Internally, column assignment uses :meth:`_assign_or_fill`
to respect the ``override`` flag.
Examples
--------
Compute default inverse-variance weights:
>>> d2 = d.calculate_weights()
Compute weights with a custom method:
>>> def custom_weights(errors):
... return 1 / errors
>>> d2 = d.calculate_weights(method=custom_weights, override=True)
Fill only missing weights without overwriting existing ones:
>>> d2 = d.calculate_weights(override=False)
"""
def inverse_variance_weights(err_days: pd.Series) -> pd.Series:
with np.errstate(divide="ignore", invalid="ignore"):
return 1.0 / np.square(err_days)
new_data = deepcopy(self)
minimum_time_error = new_data.data["minimum_time_error"]
if minimum_time_error.hasnans:
raise ValueError("minimum_time_error contains NaN value(s)")
if (minimum_time_error == 0).any():
raise ValueError("minimum_time_error contains `0`")
if method is not None and not callable(method):
raise TypeError("`method` must be callable or None for inverse variance weights")
if method is None:
method = inverse_variance_weights
weights = method(minimum_time_error)
self._assign_or_fill(new_data.data, "weights", weights, override)
return new_data
[docs]
def calculate_oc(self, reference_minimum: float, reference_period: float, model_type: str = "lmfit") -> OC:
    r"""
    Compute Observed minus Calculated (O–C) values for the dataset.

    Each observed minimum is assigned a cycle number relative to a
    linear ephemeris (``reference_minimum`` + cycle *
    ``reference_period``) and the difference between the observed
    and the predicted time is returned inside an O–C model object.

    Parameters
    ----------
    reference_minimum : float
        Reference time of minimum (initial epoch).
    reference_period : float
        Reference period. Must be non-zero.
    model_type : str, default='lmfit'
        Selects the class of the returned object
        (case-insensitive, surrounding whitespace ignored):

        - ``'lmfit'`` or ``'lmfit_model'`` – :class:`OCLMFit`
        - ``'pymc'`` or ``'pymc_model'`` – :class:`OCPyMC`
        - anything else – generic :class:`OC`

    Returns
    -------
    OC
        Instance of the selected class, constructed with
        ``minimum_time``, ``minimum_time_error``, ``weights``,
        ``minimum_type`` and ``labels`` (as lists, or ``None`` when
        the column is absent), ``cycle`` (NumPy array) and ``oc``
        (list of floats).

    Raises
    ------
    ValueError
        If the ``minimum_time`` column is missing.
    ValueError
        If ``reference_period`` is zero.

    Notes
    -----
    - **Cycle calculation:** the phase of each observation is

      .. math::

          \text{phase} = \frac{t - \text{reference\_minimum}}{\text{reference\_period}}

      and the cycle is the nearest integer to the phase.
    - **Secondary minima:** rows whose ``minimum_type`` denotes a
      secondary eclipse get half-integer cycles:

      .. math::

          \text{cycle}_{\text{sec}} = \text{round}(\text{phase} - 0.5) + 0.5

      A value counts as secondary when, after ``str()``, stripping
      and lower-casing, it is one of ``1``, ``ii``, ``sec``,
      ``secondary``, ``s``, contains ``ii``, or is numerically equal
      to 1 or 2 (so ``1.0``/``2.0`` from a float column work too).
      Booleans follow the same 0/1 convention. ``0``, ``i``, ``pri``,
      ``primary``, ``p``, missing values and anything unrecognised
      are treated as primary.
    - **O–C computation:**

      .. math::

          \text{O–C} = t_{\text{obs}} - ( \text{reference\_minimum} + \text{cycle} \times \text{reference\_period} )

    - The original `Data` is not modified.

    Examples
    --------
    >>> oc_model = d.calculate_oc(reference_minimum=2450000.0, reference_period=1.2345)
    >>> oc_model = d.calculate_oc(
    ...     reference_minimum=2450000.0,
    ...     reference_period=1.2345,
    ...     model_type="pymc"
    ... )
    >>> oc_model.oc
    [0.0001, -0.0002, 0.0003, ...]
    """
    secondary_tokens = {"1", "ii", "sec", "secondary", "s"}
    primary_tokens = {"0", "i", "pri", "primary", "p"}

    def is_secondary(value) -> bool:
        """Decide whether one ``minimum_type`` entry marks a secondary minimum."""
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return False
        if isinstance(value, (bool, np.bool_)):
            # Same convention as the numeric codes: 1 -> secondary, 0 -> primary.
            return bool(value)
        token = str(value).strip().lower()
        if token in secondary_tokens or "ii" in token:
            return True
        if token in primary_tokens:
            return False
        try:
            # float() rather than int(): numeric columns containing NaN
            # are stored as floats, so codes arrive as "1.0" / "2.0".
            number = float(token)
        except ValueError:
            return False
        return number in (1.0, 2.0)

    df = self.data
    if "minimum_time" not in df.columns:
        raise ValueError("`minimum_time` column is required to compute O–C.")
    if reference_period == 0:
        # A zero period would only yield inf/NaN cycles and O–C values.
        raise ValueError("`reference_period` must be non-zero to compute O–C.")

    t = df["minimum_time"].to_numpy(dtype=float)
    phase = (t - reference_minimum) / reference_period
    cycle = np.rint(phase)

    if "minimum_type" in df.columns:
        kinds = df["minimum_type"].to_numpy()
        sec = np.fromiter(
            (is_secondary(kind) for kind in kinds), dtype=bool, count=len(kinds)
        )
        if sec.any():
            cycle = np.where(sec, np.rint(phase - 0.5) + 0.5, cycle)

    calculated = reference_minimum + cycle * reference_period
    oc = (t - calculated).astype(float).tolist()

    def column_or_none(name: str) -> Optional[list]:
        """Return the column as a plain list, or None when it is absent."""
        return df[name].tolist() if name in df.columns else None

    common_kwargs = dict(
        minimum_time=df["minimum_time"].tolist(),
        minimum_time_error=column_or_none("minimum_time_error"),
        weights=column_or_none("weights"),
        minimum_type=column_or_none("minimum_type"),
        labels=column_or_none("labels"),
        cycle=cycle,
        oc=oc,
    )

    requested = str(model_type).strip().lower()
    if requested in {"lmfit", "lmfit_model"}:
        target = OCLMFit
    elif requested in {"pymc", "pymc_model"}:
        target = OCPyMC
    else:
        target = OC
    return target(**common_kwargs)
[docs]
def merge(self, data: Self) -> Self:
"""
Merge the current dataset with another `Data` object.
This method concatenates the rows of the current `Data` instance
with those of another `Data` object, returning a new `Data`
instance. Column alignment is based on column names.
Parameters
----------
data : Data
Another `Data` instance to merge with the current dataset.
Returns
-------
Data
A new `Data` instance containing all rows from both datasets.
Notes
-----
- The original datasets are not modified.
- Missing columns in either dataset will result in ``NaN``
values in the merged dataset, following pandas' concatenation rules.
- Indexes are reset in the merged dataset for consistency.
Examples
--------
>>> d1 = Data(minimum_time=[2450000.1, 2450001.2])
>>> d2 = Data(minimum_time=[2450002.3, 2450003.4])
>>> d_merged = d1.merge(d2)
>>> len(d_merged)
4
"""
new_data = deepcopy(self)
new_data.data = pd.concat([self.data, data.data], ignore_index=True, sort=False)
return new_data
[docs]
def group_by(self, column: str) -> List[Self]:
"""
Split the dataset into groups based on a column.
This method groups the `Data` object by the values in a specified
column and returns a list of new `Data` instances, each containing
one group of rows.
Parameters
----------
column : str
Name of the column to group by.
Returns
-------
list of Data
A list of `Data` objects, each corresponding to one group.
If the column is missing or contains only NaN values, a list
with a single copy of the original dataset is returned.
Notes
-----
- Grouping is performed using :meth:`pandas.DataFrame.groupby`.
- The original `Data` object is not modified; each group is a
deep copy.
- NaN values are treated as a separate group unless ``dropna=True``
in the internal pandas grouping.
Examples
--------
Group a dataset by the ``minimum_type`` column:
>>> groups = d.group_by("minimum_type")
>>> len(groups)
2 # e.g., one group for primary, one for secondary minima
Access the first group:
>>> groups[0].data
minimum_time minimum_type
2450000.1 I
2450002.3 I
If the grouping column does not exist:
>>> groups = d.group_by("nonexistent_column")
>>> len(groups)
1 # returns a single copy of the original Data
"""
if column not in self.data.columns:
return [deepcopy(self)]
s = self.data[column]
if s.isna().all():
return [deepcopy(self)]
groups: List["Data"] = []
for _, df_group in self.data.groupby(s, dropna=False):
new_obj = deepcopy(self)
new_obj.data = df_group.copy()
groups.append(new_obj)
return groups