Source code for mdr.io.readers

"""
Data readers for Macrodata Refinement (MDR).

This module provides functions and classes for reading macrodata
from various file formats.
"""

import os
from typing import Dict, List, Union, Optional, Any, Tuple, Type
from abc import ABC, abstractmethod
import numpy as np
import pandas as pd
from enum import Enum, auto
import json
import csv



[docs]
class DataSource(Enum):
    """Types of data sources.

    Member values are generated by ``auto()``; code in this module refers
    to members by name (e.g. ``DataSource.FILE``), never by numeric value.
    """

    # Default ``source_type`` of DataReader and the value FileReader passes
    # to its base-class constructor.
    FILE = auto()
    # The remaining members are not referenced by any reader visible in
    # this module.
    DATABASE = auto()
    API = auto()
    MEMORY = auto()




[docs]
class DataReader(ABC):
    """Abstract base class for data readers.

    Subclasses must implement :meth:`read` and :meth:`validate_source`;
    both are declared abstract here, so this class cannot be instantiated
    directly.

    Attributes:
        source_type: The ``DataSource`` member passed to the constructor.
    """

    def __init__(self, source_type: DataSource = DataSource.FILE):
        """
        Initialize the data reader.

        Args:
            source_type: Type of data source

        Raises:
            AssertionError: If ``source_type`` is not a ``DataSource`` member.
        """
        assert isinstance(source_type, DataSource), "source_type must be a DataSource enum"
        self.source_type = source_type

    @abstractmethod
    def read(self, source: str, **options) -> Dict[str, np.ndarray]:
        """
        Read data from the source.

        Args:
            source: Source identifier (file path, table name, etc.)
            **options: Additional reading options

        Returns:
            Dictionary mapping variable names to data arrays
        """

    @abstractmethod
    def validate_source(self, source: str) -> bool:
        """
        Validate if the source exists and is readable.

        Args:
            source: Source identifier

        Returns:
            True if the source is valid, False otherwise
        """





[docs]
class FileReader(DataReader):
    """Base class for file-based data readers.

    Fixes ``source_type`` to ``DataSource.FILE`` and provides a filesystem
    implementation of :meth:`validate_source`. ``read`` is not implemented
    here and is left to concrete subclasses.

    Attributes:
        encoding: Encoding name given to the constructor.
    """

    def __init__(self, encoding: str = "utf-8"):
        """
        Initialize the file reader.

        Args:
            encoding: File encoding

        Raises:
            AssertionError: If ``encoding`` is not a string.
        """
        super().__init__(DataSource.FILE)
        assert isinstance(encoding, str), "encoding must be a string"
        self.encoding = encoding

    def validate_source(self, source: str) -> bool:
        """
        Validate if the file exists and is readable.

        Args:
            source: File path

        Returns:
            True if ``source`` is an existing regular file that the current
            process may read, False otherwise

        Raises:
            AssertionError: If ``source`` is not a string.
        """
        assert isinstance(source, str), "source must be a string"
        return os.path.isfile(source) and os.access(source, os.R_OK)





[docs]
class CSVReader(FileReader):
    """Reader for CSV files.

    Attributes:
        delimiter: Single-character field delimiter.
        quotechar: Single-character quoting character.
        encoding: File encoding (stored by ``FileReader.__init__``).
    """

    def __init__(
        self,
        delimiter: str = ",",
        quotechar: str = '"',
        encoding: str = "utf-8"
    ):
        """
        Initialize the CSV reader.

        Args:
            delimiter: Field delimiter
            quotechar: Character for quoting fields
            encoding: File encoding

        Raises:
            AssertionError: If an argument is not a string, or if
                ``delimiter`` / ``quotechar`` is not exactly one character.
        """
        super().__init__(encoding=encoding)
        assert isinstance(delimiter, str), "delimiter must be a string"
        assert isinstance(quotechar, str), "quotechar must be a string"
        assert len(delimiter) == 1, "delimiter must be a single character"
        assert len(quotechar) == 1, "quotechar must be a single character"

        self.delimiter = delimiter
        self.quotechar = quotechar

    def read(
        self,
        source: str,
        header: bool = True,
        index_col: Optional[Union[int, str]] = None,
        na_values: Optional[List[str]] = None,
        parse_dates: bool = False,
        **options
    ) -> Dict[str, np.ndarray]:
        """
        Read data from a CSV file.

        Args:
            source: File path
            header: Whether to use the first row as column names
            index_col: Column to use as the index
            na_values: List of strings to interpret as NA/NaN
            parse_dates: Whether to parse date columns
            **options: Additional pandas.read_csv options. A ``delimiter``,
                ``sep``, ``quotechar`` or ``encoding`` given here overrides
                the value configured on the reader for this call only.

        Returns:
            Dictionary mapping column names to data arrays

        Raises:
            AssertionError: If an argument has the wrong type.
            ValueError: If ``source`` is not an existing, readable file.
        """
        assert isinstance(source, str), "source must be a string"
        assert isinstance(header, bool), "header must be a boolean"
        if index_col is not None:
            assert isinstance(index_col, (int, str)), "index_col must be an integer or string"
        if na_values is not None:
            assert isinstance(na_values, list), "na_values must be a list"
        assert isinstance(parse_dates, bool), "parse_dates must be a boolean"

        if not self.validate_source(source):
            raise ValueError(f"Invalid or inaccessible file: {source}")

        # Start from the reader-level configuration ...
        read_kwargs: Dict[str, Any] = {
            "delimiter": self.delimiter,
            "header": 0 if header else None,
            "index_col": index_col,
            "na_values": na_values,
            "parse_dates": parse_dates,
            "quotechar": self.quotechar,
            "encoding": self.encoding,
        }
        # ... and let per-call options win. Passing them alongside the
        # explicit keywords would raise "got multiple values for keyword
        # argument"; pandas also refuses ``sep`` together with ``delimiter``.
        if "sep" in options:
            del read_kwargs["delimiter"]
        read_kwargs.update(options)

        # Read CSV using pandas
        df = pd.read_csv(source, **read_kwargs)

        # Convert DataFrame to dictionary of numpy arrays
        return {str(column): df[column].to_numpy() for column in df.columns}





[docs]
class JSONReader(FileReader):
    """Reader for JSON files.

    Uses the constructor inherited from ``FileReader``, so the only
    reader-level setting is ``encoding``.
    """

    def read(
        self,
        source: str,
        orient: str = "columns",
        convert_dates: bool = True,
        **options
    ) -> Dict[str, np.ndarray]:
        """
        Read data from a JSON file.

        Args:
            source: File path
            orient: Expected JSON dict format, one of
                   ['columns', 'records', 'index', 'split', 'values']
            convert_dates: Whether to convert date strings to datetime objects
            **options: Additional pandas.read_json options. An ``encoding``
                given here overrides the reader's own encoding.

        Returns:
            Dictionary mapping column names to data arrays

        Raises:
            AssertionError: If an argument has the wrong type or ``orient``
                is not one of the listed values.
            ValueError: If ``source`` is not an existing, readable file.
        """
        assert isinstance(source, str), "source must be a string"
        assert isinstance(orient, str), "orient must be a string"
        assert orient in ["columns", "records", "index", "split", "values"], \
            "orient must be one of ['columns', 'records', 'index', 'split', 'values']"
        assert isinstance(convert_dates, bool), "convert_dates must be a boolean"

        if not self.validate_source(source):
            raise ValueError(f"Invalid or inaccessible file: {source}")

        # Honour the encoding configured on the reader; previously it was
        # stored but never handed to pandas. setdefault keeps an explicit
        # per-call ``encoding`` option authoritative.
        options.setdefault("encoding", self.encoding)

        # Read JSON using pandas
        df = pd.read_json(
            source,
            orient=orient,
            convert_dates=convert_dates,
            **options
        )

        # Convert DataFrame to dictionary of numpy arrays
        return {str(column): df[column].to_numpy() for column in df.columns}





[docs]
class ExcelReader(FileReader):
    """Reader for Excel files."""

    def read(
        self,
        source: str,
        sheet_name: Union[str, int, List, None] = 0,
        header: int = 0,
        na_values: Optional[List[str]] = None,
        **options
    ) -> Dict[str, np.ndarray]:
        """
        Read data from an Excel file.

        Args:
            source: File path
            sheet_name: Name, index, or list of sheets to read. When pandas
                returns several sheets (a list or ``None`` was given), only
                the first sheet of the returned mapping is used.
            header: Row to use for column names (0-indexed)
            na_values: List of strings to interpret as NA/NaN
            **options: Additional pandas.read_excel options

        Returns:
            Dictionary mapping column names to data arrays

        Raises:
            AssertionError: If an argument has the wrong type or ``header``
                is negative.
            ValueError: If ``source`` is not an existing, readable file, or
                if pandas returned no sheets at all.
        """
        assert isinstance(source, str), "source must be a string"
        assert isinstance(header, int), "header must be an integer"
        assert header >= 0, "header must be a non-negative integer"
        if na_values is not None:
            assert isinstance(na_values, list), "na_values must be a list"

        if not self.validate_source(source):
            raise ValueError(f"Invalid or inaccessible file: {source}")

        # Read Excel using pandas
        df = pd.read_excel(
            source,
            sheet_name=sheet_name,
            header=header,
            na_values=na_values,
            **options
        )

        # Handle multiple sheets: pandas returns a dict of DataFrames.
        if isinstance(df, dict):
            if not df:
                # e.g. sheet_name=[]; fail clearly instead of leaking a
                # bare StopIteration from next() below.
                raise ValueError(f"No sheets were read from: {source}")
            # Return the first sheet if multiple sheets are found
            df = df[next(iter(df))]

        # Convert DataFrame to dictionary of numpy arrays
        return {str(column): df[column].to_numpy() for column in df.columns}





[docs]
class ParquetReader(FileReader):
    """Reader for Parquet files."""

    def read(
        self,
        source: str,
        columns: Optional[List[str]] = None,
        **options
    ) -> Dict[str, np.ndarray]:
        """
        Read data from a Parquet file.

        Args:
            source: File path
            columns: List of columns to read (None for all)
            **options: Additional pandas.read_parquet options

        Returns:
            Dictionary mapping column names to data arrays

        Raises:
            AssertionError: If an argument has the wrong type.
            ValueError: If ``source`` is not an existing, readable file.
            ImportError: If pandas raises ImportError while reading (no
                Parquet engine available).
        """
        assert isinstance(source, str), "source must be a string"
        if columns is not None:
            assert isinstance(columns, list), "columns must be a list"
            for col in columns:
                assert isinstance(col, str), "Each column name must be a string"

        if not self.validate_source(source):
            raise ValueError(f"Invalid or inaccessible file: {source}")

        # Only the pandas call can hit the missing-engine ImportError, so
        # keep the try body to that one statement and chain the cause.
        try:
            df = pd.read_parquet(
                source,
                columns=columns,
                **options
            )
        except ImportError as exc:
            raise ImportError(
                "pyarrow or fastparquet is required for reading Parquet files"
            ) from exc

        # Convert DataFrame to dictionary of numpy arrays
        return {str(column): df[column].to_numpy() for column in df.columns}





[docs]
class HDF5Reader(FileReader):
    """Reader for HDF5 files."""

    def read(
        self,
        source: str,
        key: str,
        **options
    ) -> Dict[str, np.ndarray]:
        """
        Read data from an HDF5 file.

        Args:
            source: File path
            key: Group identifier in the HDF5 file
            **options: Additional pandas.read_hdf options

        Returns:
            Dictionary mapping column names to data arrays. A stored Series
            is returned as a single entry keyed by the string form of its
            column label after ``Series.to_frame()``.

        Raises:
            AssertionError: If an argument has the wrong type.
            ValueError: If ``source`` is not an existing, readable file.
            ImportError: If pandas raises ImportError while reading (the
                ``tables`` package is missing).
        """
        assert isinstance(source, str), "source must be a string"
        assert isinstance(key, str), "key must be a string"

        if not self.validate_source(source):
            raise ValueError(f"Invalid or inaccessible file: {source}")

        # Only the pandas call can hit the missing-dependency ImportError.
        try:
            loaded = pd.read_hdf(
                source,
                key=key,
                **options
            )
        except ImportError as exc:
            raise ImportError("tables is required for reading HDF5 files") from exc

        # read_hdf returns whatever object was stored; a Series has no
        # ``.columns``, so promote it to a one-column frame first.
        if isinstance(loaded, pd.Series):
            loaded = loaded.to_frame()

        # Convert DataFrame to dictionary of numpy arrays
        return {str(column): loaded[column].to_numpy() for column in loaded.columns}




# Factory function to create readers for different file types

[docs]
def get_reader(file_type: str, **options) -> DataReader:
    """
    Build the reader that handles ``file_type``.

    The lookup is case-insensitive. Recognised names are 'csv', 'json',
    'excel' (also 'xlsx', 'xls'), 'parquet' and 'hdf5' (also 'h5').

    Args:
        file_type: Name of the file format
        **options: Keyword arguments passed to the reader's constructor

    Returns:
        A new instance of the matching reader class

    Raises:
        AssertionError: If ``file_type`` is not a string.
        ValueError: If ``file_type`` is not a recognised name.
    """
    assert isinstance(file_type, str), "file_type must be a string"
    kind = file_type.lower()

    # Each branch returns immediately; falling through means no match.
    if kind == 'csv':
        return CSVReader(**options)
    if kind == 'json':
        return JSONReader(**options)
    if kind in ('excel', 'xlsx', 'xls'):
        return ExcelReader(**options)
    if kind == 'parquet':
        return ParquetReader(**options)
    if kind in ('hdf5', 'h5'):
        return HDF5Reader(**options)

    raise ValueError(f"Unsupported file type: {kind}")



# Convenience functions for reading different file types

[docs]
def read_csv(
    filepath: str,
    delimiter: str = ",",
    header: bool = True,
    **options
) -> Dict[str, np.ndarray]:
    """
    Read data from a CSV file.

    Args:
        filepath: Path to the CSV file
        delimiter: Field delimiter
        header: Whether to use the first row as column names
        **options: Additional reading options. ``quotechar`` and
            ``encoding`` are passed to the ``CSVReader`` constructor;
            everything else is passed to ``CSVReader.read``.

    Returns:
        Dictionary mapping column names to data arrays

    Raises:
        AssertionError: If ``filepath``, ``delimiter`` or ``header`` has the
            wrong type, or ``delimiter`` is not exactly one character.
    """
    assert isinstance(filepath, str), "filepath must be a string"
    assert isinstance(delimiter, str), "delimiter must be a string"
    assert len(delimiter) == 1, "delimiter must be a single character"
    assert isinstance(header, bool), "header must be a boolean"

    # quotechar/encoding are reader-level settings. Left in ``options`` they
    # would reach pandas.read_csv twice (once from the reader's attributes,
    # once from **options) and raise TypeError, so move them to the
    # constructor instead.
    reader_kwargs = {}
    for name in ("quotechar", "encoding"):
        if name in options:
            reader_kwargs[name] = options.pop(name)

    reader = CSVReader(delimiter=delimiter, **reader_kwargs)
    return reader.read(filepath, header=header, **options)




[docs]
def read_json(
    filepath: str,
    orient: str = "columns",
    **options
) -> Dict[str, np.ndarray]:
    """
    Load a JSON file through a default-configured ``JSONReader``.

    Args:
        filepath: Path to the JSON file
        orient: Expected JSON layout; one of 'columns', 'records',
            'index', 'split', 'values'
        **options: Extra keyword arguments passed to ``JSONReader.read``

    Returns:
        Dictionary mapping column names to data arrays

    Raises:
        AssertionError: If ``filepath`` or ``orient`` is not a string, or
            ``orient`` is not one of the accepted layouts.
    """
    accepted_orients = ("columns", "records", "index", "split", "values")

    assert isinstance(filepath, str), "filepath must be a string"
    assert isinstance(orient, str), "orient must be a string"
    assert orient in accepted_orients, \
        "orient must be one of ['columns', 'records', 'index', 'split', 'values']"

    return JSONReader().read(filepath, orient=orient, **options)




[docs]
def read_excel(
    filepath: str,
    sheet_name: Union[str, int, List, None] = 0,
    **options
) -> Dict[str, np.ndarray]:
    """
    Load an Excel file through a default-configured ``ExcelReader``.

    Args:
        filepath: Path to the Excel file
        sheet_name: Sheet name, sheet index, or list of sheets, passed
            unchanged to ``ExcelReader.read``
        **options: Extra keyword arguments passed to ``ExcelReader.read``

    Returns:
        Dictionary mapping column names to data arrays

    Raises:
        AssertionError: If ``filepath`` is not a string.
    """
    assert isinstance(filepath, str), "filepath must be a string"

    return ExcelReader().read(filepath, sheet_name=sheet_name, **options)




[docs]
def read_parquet(
    filepath: str,
    columns: Optional[List[str]] = None,
    **options
) -> Dict[str, np.ndarray]:
    """
    Load a Parquet file through a default-configured ``ParquetReader``.

    Args:
        filepath: Path to the Parquet file
        columns: Names of the columns to load, or None to load all of them
        **options: Extra keyword arguments passed to ``ParquetReader.read``

    Returns:
        Dictionary mapping column names to data arrays

    Raises:
        AssertionError: If ``filepath`` is not a string, ``columns`` is
            neither None nor a list, or a column name is not a string.
    """
    assert isinstance(filepath, str), "filepath must be a string"

    column_filter_given = columns is not None
    if column_filter_given:
        assert isinstance(columns, list), "columns must be a list"
        # Checked one at a time so the message names the first bad entry's type.
        for name in columns:
            assert isinstance(name, str), f"Each column name must be a string, got {type(name)}"

    return ParquetReader().read(filepath, columns=columns, **options)




[docs]
def read_hdf5(
    filepath: str,
    key: str,
    **options
) -> Dict[str, np.ndarray]:
    """
    Load an HDF5 file through a default-configured ``HDF5Reader``.

    Args:
        filepath: Path to the HDF5 file
        key: Group identifier inside the HDF5 file
        **options: Extra keyword arguments passed to ``HDF5Reader.read``

    Returns:
        Dictionary mapping column names to data arrays

    Raises:
        AssertionError: If ``filepath`` or ``key`` is not a string.
    """
    assert isinstance(filepath, str), "filepath must be a string"
    assert isinstance(key, str), "key must be a string"

    return HDF5Reader().read(filepath, key=key, **options)