src.data

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import os
import urllib.request
import torch
from torch.utils.data import Dataset
import tempfile

class HepatitisDataset(Dataset):
    """
    Custom PyTorch Dataset for Hepatitis data.

    This dataset can be reused with different models and training approaches.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Feature matrix.
    y : np.ndarray or pd.Series
        Target vector.

    Attributes
    ----------
    X : torch.FloatTensor
        Feature matrix as a FloatTensor.
    y : torch.LongTensor
        Target vector as a LongTensor.

    Examples
    --------
    >>> from torch.utils.data import DataLoader
    >>> from src.data import HepatitisDataset
    >>> dataset = HepatitisDataset(X_train, y_train)
    >>> loader = DataLoader(dataset, batch_size=32, shuffle=True)
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # Accept both pandas objects and plain arrays, as documented above.
        self.X = torch.FloatTensor(X.values if hasattr(X, 'values') else X)
        self.y = torch.LongTensor(y.values if hasattr(y, 'values') else y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx: int):
        return self.X[idx], self.y[idx]
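
# Usage sketch (illustrative comment; names such as X_train_scaled and
# y_train are assumed to come from split_and_scale_data further below):
#
#   from torch.utils.data import DataLoader
#   train_ds = HepatitisDataset(X_train_scaled, y_train)
#   train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
#   features, labels = next(iter(train_loader))  # float32 batch, int64 labels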

def download_dataset(target_path: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> bool | str:
    """
    Download the Hepatitis C dataset from a public source if not already present.

    Parameters
    ----------
    target_path : str
        Path where the dataset should be saved.
    demo : bool
        If True, ignore target_path and save the file to the system
        temporary directory instead, so it can be cleaned up after use.

    Returns
    -------
    bool or str
        True if the download succeeded or the file already exists,
        False otherwise. In demo mode, returns the temporary file path
        on success instead of True.

    Examples
    --------
    >>> download_dataset()
    True
    """
    if demo:
        target_path = os.path.join(tempfile.gettempdir(), 'hepatitis_data.csv')
        print(f"Using temporary dataset path: {target_path}")
    else:
        if os.path.exists(target_path):
            print(f"Dataset already exists at: {target_path}")
            return True

        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # URL to a reliable source - using a direct CSV link
    # This is the UCI ML Repository version of the dataset
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv"

    try:
        print(f"Downloading dataset from {url}...")
        urllib.request.urlretrieve(url, target_path)
        print(f"✅ Dataset downloaded successfully to: {target_path}")
        if demo:
            return target_path
        return True
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        print("\n📌 Alternative: Download manually from:")
        print("   https://www.kaggle.com/datasets/fedesoriano/hepatitis-c-dataset")
        print(f"   and place it in: {target_path}")
        return False
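
# Demo-mode sketch: with demo=True the function writes into the system
# temp directory and, on success, returns the resolved path (not True),
# so it can be handed straight to pandas:
#
#   path = download_dataset(demo=True)
#   if path:
#       df = pd.read_csv(path)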

def load_raw_data(filepath: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> pd.DataFrame | None:
    """
    Load raw data from a CSV file. If the file doesn't exist, attempts to download it automatically.

    Parameters
    ----------
    filepath : str
        Path to the CSV file to be loaded.
    demo : bool
        If True, load from (and download to) a temporary file path for
        demo purposes instead of filepath.

    Returns
    -------
    pd.DataFrame or None
        Loaded dataset as a pandas DataFrame, or None if it could not be
        loaded or downloaded.

    Examples
    --------
    >>> df = load_raw_data()
    >>> df.head()
    """
    if demo:
        # Use the same temporary location as download_dataset(demo=True)
        filepath = os.path.join(tempfile.gettempdir(), 'hepatitis_data.csv')

    try:
        df = pd.read_csv(filepath)
        # Rename the unnamed index column to Patient ID
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'Patient ID'})
        print(f"Dataset loaded successfully: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        print("Attempting to download dataset automatically...")

        if download_dataset(filepath):
            # Try loading again after download
            try:
                df = pd.read_csv(filepath)
                if 'Unnamed: 0' in df.columns:
                    df = df.rename(columns={'Unnamed: 0': 'Patient ID'})
                print(f"Dataset loaded successfully: {df.shape}")
                return df
            except Exception as e:
                print(f"Error loading downloaded dataset: {e}")
                return None
        else:
            print("Please download the dataset manually from Kaggle and place it in data/raw/")
            return None

def get_data_info(df: pd.DataFrame) -> dict | None:
    """
    Summarize a loaded dataset: shape, columns, missing-value counts,
    target distribution, and dtypes. Returns None if df is None.
    """
    if df is None:
        return None

    info = {
        'shape': df.shape,
        'columns': list(df.columns),
        'missing_values': df.isnull().sum(),
        'target_distribution': df['Category'].value_counts() if 'Category' in df.columns else None,
        'data_types': df.dtypes
    }

    return info
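
# Usage sketch: quick inspection of a loaded frame before cleaning.
#
#   info = get_data_info(df)
#   if info:
#       print(info['shape'], info['missing_values'].sum())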

def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, LabelEncoder] | None:
    """
    Clean and preprocess the dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset to be cleaned.

    Returns
    -------
    tuple of (pd.DataFrame, LabelEncoder) or None
        The cleaned dataset with a binary 'target' column and an encoded
        'sex_encoded' column, plus the fitted sex encoder; None if df is
        None.

    Examples
    --------
    >>> cleaned_df, sex_encoder = clean_data(df)
    >>> cleaned_df.head()
    """
    if df is None:
        return None
    data = df.copy()

    # Keep Patient ID column for identification
    # Remove it only if needed for modeling
    if 'Unnamed: 0' in data.columns:
        data = data.drop('Unnamed: 0', axis=1)

    def simplify_category(category: str) -> int:
        """
        Simplify the category labels into a binary target.

        Parameters
        ----------
        category : str
            Original category label.

        Returns
        -------
        int
            0 for healthy (blood donors, including suspect donors),
            1 for hepatitis C (including fibrosis and cirrhosis).

        Examples
        --------
        >>> simplify_category('0=Blood Donor')
        0
        >>> simplify_category('1=Hepatitis C')
        1
        """
        if category in ['0=Blood Donor', '0s=suspect Blood Donor']:
            return 0
        else:
            return 1

    data['target'] = data['Category'].apply(simplify_category)

    sex_encoder = LabelEncoder()
    data['sex_encoded'] = sex_encoder.fit_transform(data['Sex'])

    print("Data cleaned successfully")
    print(f"Healthy: {sum(data['target'] == 0)} samples")
    print(f"Hepatitis C: {sum(data['target'] == 1)} samples")

    return data, sex_encoder

def prepare_features(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, SimpleImputer] | None:
    """
    Prepare features for modeling.

    Parameters
    ----------
    data : pd.DataFrame
        Cleaned dataset with necessary transformations applied.

    Returns
    -------
    tuple of (pd.DataFrame, pd.Series, SimpleImputer) or None
        The median-imputed feature matrix, the target vector, and the
        fitted imputer; None if data is None.

    Examples
    --------
    >>> X, y, imputer = prepare_features(cleaned_df)
    >>> X.head()
    """
    if data is None:
        return None
    feature_columns = ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT', 'sex_encoded']

    X = data[feature_columns]
    y = data['target']

    # Fill missing lab values with the column median
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X),
        columns=X.columns,
        index=X.index
    )

    print(f"Features prepared: {X_imputed.shape}")
    print(f"Missing values after imputation: {X_imputed.isnull().sum().sum()}")

    return X_imputed, y, imputer

def split_and_scale_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series, StandardScaler]:
    """
    Split and scale the dataset.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target vector.
    test_size : float
        Proportion of the dataset to include in the test split.
    random_state : int
        Random seed for reproducibility.

    Returns
    -------
    tuple
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
    """
    # Stratify to preserve the class balance in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    # Fit the scaler on the training split only to avoid test-set leakage
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("✅ Data split and scaled:")
    print(f"   Training set: {X_train_scaled.shape}")
    print(f"   Test set: {X_test_scaled.shape}")

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler