Source code for ocpy.data

from pathlib import Path
from typing import Union, Optional, Dict, List, Tuple, Callable

from typing_extensions import Self

import pandas as pd
import numpy as np
from copy import deepcopy

from ocpy.model_data import DataModel
from ocpy.custom_types import BinarySeq
from ocpy.utils import Fixer

from .errors import LengthCheckError
from .oc import OC
from .oc_lmfit import OCLMFit
from .oc_pymc import OCPyMC


class Data(DataModel):
    """
    Container for eclipse minimum timing data.

    The observations are stored in a :class:`pandas.DataFrame` (attribute
    ``data``) with the standard columns ``minimum_time``,
    ``minimum_time_error``, ``weights``, ``minimum_type`` and ``labels``.
    On top of that table the class offers helpers to fill or compute
    uncertainties and weights, load data from files, compute O–C
    (Observed minus Calculated) values, and merge or group datasets.

    Parameters
    ----------
    minimum_time : list-like
        Times of observed minima. Required; it defines the length of the
        dataset.
    minimum_time_error : list-like, optional
        Uncertainty of each minimum time.
    weights : list-like, optional
        Weight of each observation.
    minimum_type : BinarySeq, optional
        Indicator of the minimum type (primary/secondary), e.g. ``"I"`` /
        ``"II"``, ``"primary"`` / ``"secondary"`` or ``0`` / ``1``.
    labels : list-like, optional
        Label or identifier of each observation.

    Raises
    ------
    ValueError
        If ``minimum_time`` is ``None``.

    Notes
    -----
    Every optional sequence is passed through
    :func:`ocpy.utils.Fixer.length_fixer` together with ``minimum_time``
    before the table is built; length validation (``LengthCheckError``)
    is presumably performed there.

    The ``fill_*``, ``calculate_weights``, ``merge`` and ``group_by``
    methods return new `Data` objects. ``__setitem__`` is the exception:
    it mutates the underlying table in place.

    Examples
    --------
    >>> from ocpy import Data
    >>> d = Data(minimum_time=[2450000.1, 2450001.3, 2450002.5])
    >>> d = Data(
    ...     minimum_time=[2450000.1, 2450001.3],
    ...     minimum_time_error=[0.0002, 0.0003],
    ...     minimum_type=["I", "II"],
    ... )
    >>> d.data  # the underlying pandas.DataFrame
    """

    def __init__(
        self,
        minimum_time: List,
        minimum_time_error: Optional[List] = None,
        weights: Optional[List] = None,
        minimum_type: Optional[BinarySeq] = None,
        labels: Optional[List] = None,
    ) -> None:
        """
        Build the internal table from the given timing data.

        Parameters
        ----------
        minimum_time : list-like
            Times of observed minima (required).
        minimum_time_error : list-like or float, optional
            Uncertainties of the minimum times.
        weights : list-like or float, optional
            Observation weights.
        minimum_type : BinarySeq, optional
            Primary/secondary indicator for each minimum.
        labels : list-like, optional
            Labels such as instrument, observer or literature source.

        Raises
        ------
        ValueError
            If ``minimum_time`` is ``None``.

        Notes
        -----
        Each optional argument is normalized with
        :func:`ocpy.utils.Fixer.length_fixer` against ``minimum_time``.
        An object without ``__len__`` passed as ``minimum_time`` is wrapped
        in a one-element list so that pandas can build the frame.

        Examples
        --------
        >>> Data(minimum_time=[2450000.1, 2450001.2, 2450002.4])
        >>> Data(
        ...     minimum_time=[2450000.1, 2450001.2],
        ...     minimum_time_error=0.0002,
        ... )
        """
        if minimum_time is None:
            raise ValueError("`minimum_time` is required and cannot be None.")

        fixed_minimum_time_error = Fixer.length_fixer(minimum_time_error, minimum_time)
        fixed_weights = Fixer.length_fixer(weights, minimum_time)
        fixed_minimum_type = Fixer.length_fixer(minimum_type, minimum_time)
        fixed_labels = Fixer.length_fixer(labels, minimum_time)

        # A bare scalar would make pandas raise ("all scalar values"), so
        # wrap anything that has no length in a list.
        if hasattr(minimum_time, "__len__"):
            minimum_time_sequence = minimum_time
        else:
            minimum_time_sequence = [minimum_time]

        self.data = pd.DataFrame(
            {
                "minimum_time": minimum_time_sequence,
                "minimum_time_error": fixed_minimum_time_error,
                "weights": fixed_weights,
                "minimum_type": fixed_minimum_type,
                "labels": fixed_labels,
            }
        )

    def __str__(self) -> str:
        """Return the string representation of the underlying table."""
        return str(self.data)

    def __getitem__(self, item) -> Union[Self, pd.Series]:
        """
        Retrieve a column or a subset of rows.

        Parameters
        ----------
        item : str, int, slice, array-like, or pandas-compatible indexer
            * ``str`` – name of the column to return.
            * integer (Python ``int`` or NumPy integer) – position of a
              single row, resolved with ``DataFrame.iloc``.
            * anything else (slice, boolean mask, ...) – forwarded to
              ``DataFrame.__getitem__`` to select rows.

        Returns
        -------
        Data or pandas.Series
            The column as a :class:`pandas.Series` when ``item`` is a
            string; otherwise a new `Data` object built from the five
            standard columns of the selected row(s).

        Notes
        -----
        Only the standard columns are carried over into the returned
        `Data`; columns added through ``__setitem__`` are not.

        Examples
        --------
        >>> d["minimum_time"]
        >>> d[0]
        >>> d[d["weights"] > 1]
        >>> d[:5]
        """
        if isinstance(item, str):
            return self.data[item]

        # NumPy integers (e.g. from ``np.argmin``) are not ``int`` subclasses;
        # without this they would be looked up as column labels.
        if isinstance(item, (int, np.integer)):
            row = self.data.iloc[item]
            return Data(
                minimum_time=[row["minimum_time"]],
                minimum_time_error=[row["minimum_time_error"]],
                weights=[row["weights"]],
                minimum_type=[row["minimum_type"]],
                labels=[row["labels"]],
            )

        filtered_table = self.data[item]
        return Data(
            minimum_time=filtered_table["minimum_time"],
            minimum_time_error=filtered_table["minimum_time_error"],
            weights=filtered_table["weights"],
            minimum_type=filtered_table["minimum_type"],
            labels=filtered_table["labels"],
        )

    def __setitem__(self, key, value) -> None:
        """
        Assign ``value`` to the column ``key`` of the underlying table.

        Parameters
        ----------
        key : str
            Name of the column to create or overwrite.
        value : scalar or array-like
            Values to store; assignment is delegated to
            ``DataFrame.loc[:, key]``.

        Notes
        -----
        This mutates the object in place.

        Examples
        --------
        >>> d["instrument"] = "TESS"
        >>> d["weights"] = [1.0, 0.5, 2.0]
        """
        self.data.loc[:, key] = value

    def __len__(self) -> int:
        """Return the number of rows in the dataset."""
        return len(self.data)

    @classmethod
    def from_file(cls, file: Union[str, Path], columns: Optional[Dict[str, str]] = None) -> Self:
        """
        Create a `Data` object from a CSV or Excel file.

        Parameters
        ----------
        file : str or pathlib.Path
            Path to the input file. The suffix (case-insensitive) selects
            the reader: ``.csv`` uses :func:`pandas.read_csv`; ``.xls`` and
            ``.xlsx`` use :func:`pandas.read_excel`.
        columns : dict of str to str, optional
            Mapping between the standard column names (``minimum_time``,
            ``minimum_time_error``, ``weights``, ``minimum_type``,
            ``labels``) and the names used in the file. Both directions
            are accepted:

            * standard → file, e.g. ``{"minimum_time": "BJD"}``
            * file → standard, e.g. ``{"BJD": "minimum_time"}``

            If any *key* is a standard name the mapping is taken to be
            standard → file and is inverted before renaming.

        Returns
        -------
        Data
            A new instance built from the standard columns found in the
            file; standard columns that are absent are passed as ``None``.

        Raises
        ------
        ValueError
            If the file suffix is not supported, or if no
            ``minimum_time`` column exists after renaming.

        Examples
        --------
        >>> d = Data.from_file("minima.csv")
        >>> d = Data.from_file(
        ...     "observations.csv",
        ...     columns={"BJD": "minimum_time", "err": "minimum_time_error"},
        ... )
        """
        file_path = Path(file)
        suffix = file_path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(file_path)
        elif suffix in (".xls", ".xlsx"):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file type. Use `csv`, `xls`, or `xlsx` instead")

        expected = ["minimum_time", "minimum_time_error", "weights", "minimum_type", "labels"]

        if columns:
            if any(k in expected for k in columns):
                # Keys are standard names: flip to file-name → standard-name.
                rename_map = {v: k for k, v in columns.items()}
            else:
                rename_map = columns
            df = df.rename(columns=rename_map)

        if "minimum_time" not in df.columns:
            available = list(df.columns)
            raise ValueError(f"Could not find 'minimum_time' in file columns. Available columns: {available}. "
                             f"Please check your 'columns' mapping.")

        kwargs = {c: (df[c] if c in df.columns else None) for c in expected}
        return cls(**kwargs)

    def _assign_or_fill(self, df: pd.DataFrame, col: str, values, override: bool) -> None:
        """
        Assign ``values`` to ``df[col]`` or use them to fill its gaps.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to modify in place.
        col : str
            Name of the column to assign or update.
        values : scalar or array-like
            Replacement values.
        override : bool
            If ``True`` (or if ``col`` does not exist yet) the column is
            replaced entirely. If ``False`` only entries for which
            :func:`pandas.isna` is true are replaced.

        Notes
        -----
        Internal helper shared by :meth:`fill_errors`,
        :meth:`fill_weights` and :meth:`calculate_weights`.
        """
        if override or col not in df.columns:
            df[col] = values
            return

        current = df[col]
        # Keep existing entries where present; take ``values`` elsewhere.
        df[col] = current.where(~pd.isna(current), values)

    def fill_errors(self, errors: Union[List, Tuple, np.ndarray, float], override: bool = False) -> Self:
        """
        Return a copy with ``minimum_time_error`` filled or replaced.

        Parameters
        ----------
        errors : list, tuple, numpy.ndarray, or float
            Uncertainty values. A list, tuple or array must have exactly
            as many elements as the dataset has rows.
        override : bool, default=False
            If ``True`` every existing value is replaced; otherwise only
            missing entries are filled.

        Returns
        -------
        Data
            A deep copy of this object with the updated column.

        Raises
        ------
        LengthCheckError
            If ``errors`` is a list, tuple or array of the wrong length.

        Examples
        --------
        >>> d2 = d.fill_errors(0.0002)
        >>> d2 = d.fill_errors([0.0002, 0.0003, 0.00025])
        >>> d2 = d.fill_errors(0.0002, override=True)
        """
        new_data = deepcopy(self)

        if isinstance(errors, (list, tuple, np.ndarray)) and len(errors) != len(new_data.data):
            raise LengthCheckError("Length of `errors` must be equal to the length of the data")

        self._assign_or_fill(new_data.data, "minimum_time_error", errors, override)
        return new_data

    def fill_weights(self, weights: Union[List, Tuple, np.ndarray, float], override: bool = False) -> Self:
        """
        Return a copy with the ``weights`` column filled or replaced.

        Parameters
        ----------
        weights : list, tuple, numpy.ndarray, or float
            Weight values. A list, tuple or array must have exactly as
            many elements as the dataset has rows.
        override : bool, default=False
            If ``True`` every existing value is replaced; otherwise only
            missing entries are filled.

        Returns
        -------
        Data
            A deep copy of this object with the updated column.

        Raises
        ------
        LengthCheckError
            If ``weights`` is a list, tuple or array of the wrong length.

        Examples
        --------
        >>> d2 = d.fill_weights(1.0)
        >>> d2 = d.fill_weights([1.0, 0.5, 2.0])
        >>> d2 = d.fill_weights(1.0, override=True)
        """
        new_data = deepcopy(self)

        if isinstance(weights, (list, tuple, np.ndarray)) and len(weights) != len(new_data.data):
            raise LengthCheckError("Length of `weights` must be equal to the length of the data")

        self._assign_or_fill(new_data.data, "weights", weights, override)
        return new_data

    def calculate_weights(
        self,
        method: Optional[Callable[[pd.Series], pd.Series]] = None,
        override: bool = True,
    ) -> Self:
        r"""
        Return a copy whose weights are derived from the timing errors.

        Parameters
        ----------
        method : callable, optional
            Function mapping the ``minimum_time_error`` series to a series
            of weights. If ``None`` (default) inverse-variance weighting
            is used:

            .. math::

                w_i = \frac{1}{\sigma_i^2}

            where :math:`\sigma_i` is the timing uncertainty of the i-th
            observation.
        override : bool, default=True
            If ``True`` existing weights are replaced; if ``False`` only
            missing weights are filled.

        Returns
        -------
        Data
            A deep copy of this object with the updated ``weights``.

        Raises
        ------
        ValueError
            If ``minimum_time_error`` contains NaN or zero values.
        TypeError
            If ``method`` is neither ``None`` nor callable.

        Examples
        --------
        >>> d2 = d.calculate_weights()
        >>> d2 = d.calculate_weights(method=lambda err: 1 / err)
        >>> d2 = d.calculate_weights(override=False)
        """

        def inverse_variance_weights(err_days: pd.Series) -> pd.Series:
            with np.errstate(divide="ignore", invalid="ignore"):
                return 1.0 / np.square(err_days)

        new_data = deepcopy(self)
        minimum_time_error = new_data.data["minimum_time_error"]

        if minimum_time_error.hasnans:
            raise ValueError("minimum_time_error contains NaN value(s)")
        if (minimum_time_error == 0).any():
            raise ValueError("minimum_time_error contains `0`")
        if method is not None and not callable(method):
            raise TypeError("`method` must be callable or None for inverse variance weights")

        if method is None:
            method = inverse_variance_weights

        weights = method(minimum_time_error)
        self._assign_or_fill(new_data.data, "weights", weights, override)
        return new_data

    @staticmethod
    def _is_secondary_minimum(value) -> bool:
        """
        Decide whether one ``minimum_type`` entry denotes a secondary minimum.

        Rules, applied in order:

        * ``None`` / NaN → not secondary (treated as primary).
        * booleans → their truth value (``True`` equals the binary code 1).
        * case-insensitive text ``"1"``, ``"ii"``, ``"sec"``,
          ``"secondary"``, ``"s"``, or any text containing ``"ii"``
          → secondary.
        * text ``"0"``, ``"i"``, ``"pri"``, ``"primary"``, ``"p"``
          → primary.
        * any other text that parses as a number → secondary only if it
          equals 1 or 2 (so ``1.0`` and ``2.0`` behave like ``1`` and ``2``).
        * everything else → primary.
        """
        if value is None:
            return False
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if isinstance(value, float) and np.isnan(value):
            return False

        token = str(value).strip().lower()
        if token in {"1", "ii", "sec", "secondary", "s"} or "ii" in token:
            return True
        if token in {"0", "i", "pri", "primary", "p"}:
            return False

        # Integer codes arrive as floats when the column holds missing
        # values (pandas upcasts), so parse via ``float`` rather than ``int``.
        try:
            number = float(token)
        except ValueError:
            return False
        return number in (1.0, 2.0)

    def calculate_oc(self, reference_minimum: float, reference_period: float, model_type: str = "lmfit") -> OC:
        r"""
        Compute Observed minus Calculated (O–C) values for the dataset.

        Parameters
        ----------
        reference_minimum : float
            Reference time of minimum :math:`T_0`.
        reference_period : float
            Reference period :math:`P`.
        model_type : str, default='lmfit'
            Selects the returned class (case-insensitive, surrounding
            whitespace ignored):

            - ``'lmfit'`` or ``'lmfit_model'`` – :class:`OCLMFit`
            - ``'pymc'`` or ``'pymc_model'`` – :class:`OCPyMC`
            - anything else – :class:`OC`

        Returns
        -------
        OC
            Instance of the selected class, constructed with the five
            standard columns (as lists, or ``None`` when a column is
            absent), ``cycle`` (NumPy array) and ``oc`` (list of floats).

        Raises
        ------
        ValueError
            If the ``minimum_time`` column is missing.

        Notes
        -----
        With :math:`\phi = (t - T_0) / P`, the cycle of a primary minimum
        is the nearest integer to the phase,

        .. math::

            E = \operatorname{rint}(\phi),

        and the cycle of a secondary minimum is the nearest half-integer,

        .. math::

            E_{\mathrm{sec}} = \operatorname{rint}(\phi - 0.5) + 0.5 .

        The O–C value is

        .. math::

            \mathrm{O{-}C} = t - (T_0 + E \times P).

        An observation counts as secondary when its ``minimum_type`` is
        (case-insensitively) ``"1"``, ``"II"``, ``"sec"``, ``"secondary"``,
        ``"s"``, contains ``"ii"``, is boolean ``True``, or is numerically
        equal to 1 or 2. Missing or unrecognized values are treated as
        primary.

        The original `Data` object is not modified.

        Examples
        --------
        >>> oc_model = d.calculate_oc(reference_minimum=2450000.0, reference_period=1.2345)
        >>> oc_model = d.calculate_oc(
        ...     reference_minimum=2450000.0,
        ...     reference_period=1.2345,
        ...     model_type="pymc",
        ... )
        """
        df = self.data.copy()

        if "minimum_time" not in df.columns:
            raise ValueError("`minimum_time` column is required to compute O–C.")

        t = np.asarray(df["minimum_time"].to_numpy(), dtype=float)
        phase = (t - reference_minimum) / reference_period
        cycle = np.rint(phase)

        if "minimum_type" in df.columns:
            vals = df["minimum_type"].to_numpy()
            sec = np.fromiter(
                (self._is_secondary_minimum(v) for v in vals),
                dtype=bool,
                count=len(vals),
            )
            if np.any(sec):
                # Secondary minima fall half a period after the primaries.
                cycle_sec = np.rint(phase - 0.5) + 0.5
                cycle = np.where(sec, cycle_sec, cycle)

        calculated = reference_minimum + cycle * reference_period
        oc = (t - calculated).astype(float).tolist()

        def column_as_list(name: str) -> Optional[list]:
            return df[name].tolist() if name in df else None

        common_kwargs = dict(
            minimum_time=df["minimum_time"].tolist(),
            minimum_time_error=column_as_list("minimum_time_error"),
            weights=column_as_list("weights"),
            minimum_type=column_as_list("minimum_type"),
            labels=column_as_list("labels"),
            cycle=cycle,
            oc=oc,
        )

        targets = str(model_type).strip().lower()
        if targets in {"lmfit", "lmfit_model"}:
            Target = OCLMFit
        elif targets in {"pymc", "pymc_model"}:
            Target = OCPyMC
        else:
            Target = OC

        return Target(**common_kwargs)

    def merge(self, data: Self) -> Self:
        """
        Return a new `Data` holding the rows of this object and ``data``.

        Parameters
        ----------
        data : Data
            Dataset whose rows are appended after the rows of this one.

        Returns
        -------
        Data
            A deep copy of this object whose table is the concatenation
            of both tables.

        Notes
        -----
        Concatenation uses :func:`pandas.concat` with
        ``ignore_index=True`` and ``sort=False``: the resulting index is
        renumbered from zero, and columns present in only one of the two
        tables are filled with missing values for the other's rows.
        Neither input is modified.

        Examples
        --------
        >>> d1 = Data(minimum_time=[2450000.1, 2450001.2])
        >>> d2 = Data(minimum_time=[2450002.3, 2450003.4])
        >>> len(d1.merge(d2))
        4
        """
        new_data = deepcopy(self)
        new_data.data = pd.concat([self.data, data.data], ignore_index=True, sort=False)
        return new_data

    def group_by(self, column: str) -> List[Self]:
        """
        Split the dataset into one `Data` object per value of ``column``.

        Parameters
        ----------
        column : str
            Name of the column to group by.

        Returns
        -------
        list of Data
            One object per group, in the order produced by
            :meth:`pandas.DataFrame.groupby`. If ``column`` does not exist
            or contains only missing values, the list holds a single deep
            copy of this object.

        Notes
        -----
        Grouping is done with ``dropna=False``, so rows whose value is
        missing form their own group. Each group keeps the row index it
        had in the original table. The original object is not modified.

        Examples
        --------
        >>> groups = d.group_by("minimum_type")
        >>> groups[0].data
        >>> len(d.group_by("nonexistent_column"))
        1
        """
        if column not in self.data.columns:
            return [deepcopy(self)]

        s = self.data[column]
        if s.isna().all():
            return [deepcopy(self)]

        groups: List["Data"] = []
        for _, df_group in self.data.groupby(s, dropna=False):
            new_obj = deepcopy(self)
            new_obj.data = df_group.copy()
            groups.append(new_obj)
        return groups