src.data
import os
import tempfile
import urllib.request

import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset


class HepatitisDataset(Dataset):
    """
    Custom PyTorch Dataset for Hepatitis data.

    This dataset can be reused with different models and training approaches.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Feature matrix.
    y : np.ndarray or pd.Series
        Target vector.

    Attributes
    ----------
    X : torch.FloatTensor
        Feature matrix as a FloatTensor.
    y : torch.LongTensor
        Target vector as a LongTensor.

    Examples
    --------
    >>> from src.data import HepatitisDataset
    >>> dataset = HepatitisDataset(X_train, y_train)
    >>> loader = DataLoader(dataset, batch_size=32, shuffle=True)
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        # np.asarray accepts plain arrays as well as DataFrames/Series
        self.X = torch.FloatTensor(np.asarray(X))
        self.y = torch.LongTensor(np.asarray(y))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx: int):
        return self.X[idx], self.y[idx]


def download_dataset(target_path: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> bool | str:
    """
    Download the Hepatitis C dataset from a public source if not already present.

    Parameters
    ----------
    target_path : str
        Path where the dataset should be saved.
    demo : bool
        If True, downloads to a temporary file path and returns that path
        on success instead of True.

    Returns
    -------
    bool or str
        True if the download succeeded or the file already exists, False
        otherwise. In demo mode, the temporary file path on success.

    Examples
    --------
    >>> download_dataset()
    True
    """
    if demo:
        target_path = os.path.join(tempfile.gettempdir(), 'hepatitis_data.csv')
        print(f"Using temporary dataset path: {target_path}")
    else:
        if os.path.exists(target_path):
            print(f"Dataset already exists at: {target_path}")
            return True
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(target_path), exist_ok=True)

    # URL to a reliable source - using a direct CSV link
    # This is the UCI ML Repository version of the dataset
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00571/hcvdat0.csv"

    try:
        print(f"Downloading dataset from {url}...")
        urllib.request.urlretrieve(url, target_path)
        print(f"✅ Dataset downloaded successfully to: {target_path}")
        if demo:
            return target_path
        return True
    except Exception as e:
        print(f"❌ Error downloading dataset: {e}")
        print("\n📌 Alternative: Download manually from:")
        print("   https://www.kaggle.com/datasets/fedesoriano/hepatitis-c-dataset")
        print(f"   and place it in: {target_path}")
        return False


def load_raw_data(filepath: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> pd.DataFrame | None:
    """
    Load raw data from a CSV file. If the file doesn't exist, attempts to
    download it automatically.

    Parameters
    ----------
    filepath : str
        Path to the CSV file to be loaded.
    demo : bool
        If True, uses a temporary file path for demo purposes.

    Returns
    -------
    pd.DataFrame or None
        Loaded dataset as a pandas DataFrame, or None if loading failed.

    Examples
    --------
    >>> df = load_raw_data()
    >>> df.head()
    """
    if demo:
        # Mirror the temporary path used by download_dataset(demo=True)
        filepath = os.path.join(tempfile.gettempdir(), 'hepatitis_data.csv')

    try:
        df = pd.read_csv(filepath)
        # Rename the unnamed index column to Patient ID
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'Patient ID'})
        print(f"Dataset loaded successfully: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        print("Attempting to download dataset automatically...")

        if download_dataset(filepath, demo=demo):
            # Try loading again after download
            try:
                df = pd.read_csv(filepath)
                if 'Unnamed: 0' in df.columns:
                    df = df.rename(columns={'Unnamed: 0': 'Patient ID'})
                print(f"Dataset loaded successfully: {df.shape}")
                return df
            except Exception as e:
                print(f"Error loading downloaded dataset: {e}")
                return None
        else:
            print("Please download the dataset manually from Kaggle and place it in data/raw/")
            return None


def get_data_info(df: pd.DataFrame) -> dict | None:
    """
    Summarize a loaded dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to summarize.

    Returns
    -------
    dict or None
        Shape, columns, missing-value counts, target distribution, and
        dtypes, or None if df is None.
    """
    if df is None:
        return None

    info = {
        'shape': df.shape,
        'columns': list(df.columns),
        'missing_values': df.isnull().sum(),
        'target_distribution': df['Category'].value_counts() if 'Category' in df.columns else None,
        'data_types': df.dtypes
    }

    return info


def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, LabelEncoder] | None:
    """
    Clean and preprocess the dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Raw dataset to be cleaned.

    Returns
    -------
    tuple[pd.DataFrame, LabelEncoder] or None
        Cleaned dataset (with 'target' and 'sex_encoded' columns) and the
        fitted encoder for the Sex column.

    Examples
    --------
    >>> cleaned_df, sex_encoder = clean_data(df)
    >>> cleaned_df.head()
    """
    if df is None:
        return None
    data = df.copy()

    # Keep Patient ID column for identification
    # Remove it only if needed for modeling
    if 'Unnamed: 0' in data.columns:
        data = data.drop('Unnamed: 0', axis=1)

    def simplify_category(category: str) -> int:
        """
        Simplify the category labels.

        Parameters
        ----------
        category : str
            Original category label.

        Returns
        -------
        int
            Simplified category label: 0 for healthy, 1 for hepatitis C.

        Examples
        --------
        >>> simplify_category('0=Blood Donor')
        0
        >>> simplify_category('1=Hepatitis C')
        1
        """
        if category in ['0=Blood Donor', '0s=suspect Blood Donor']:
            return 0
        else:
            return 1

    data['target'] = data['Category'].apply(simplify_category)

    sex_encoder = LabelEncoder()
    data['sex_encoded'] = sex_encoder.fit_transform(data['Sex'])

    print("Data cleaned successfully")
    print(f"Healthy: {sum(data['target'] == 0)} samples")
    print(f"Hepatitis C: {sum(data['target'] == 1)} samples")

    return data, sex_encoder


def prepare_features(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, SimpleImputer] | None:
    """
    Prepare features for modeling.

    Parameters
    ----------
    data : pd.DataFrame
        Cleaned dataset with necessary transformations applied.

    Returns
    -------
    tuple[pd.DataFrame, pd.Series, SimpleImputer] or None
        Imputed feature matrix, target vector, and the fitted imputer.

    Examples
    --------
    >>> X, y, imputer = prepare_features(cleaned_df)
    >>> X.head()
    """
    if data is None:
        return None
    feature_columns = ['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE',
                       'CHOL', 'CREA', 'GGT', 'PROT', 'sex_encoded']

    X = data[feature_columns]
    y = data['target']

    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X),
        columns=X.columns,
        index=X.index
    )

    print(f"Features prepared: {X_imputed.shape}")
    print(f"Missing values after imputation: {X_imputed.isnull().sum().sum()}")

    return X_imputed, y, imputer


def split_and_scale_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series, StandardScaler]:
    """
    Split and scale the dataset.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix.
    y : pd.Series
        Target vector.
    test_size : float
        Proportion of the dataset to include in the test split.
    random_state : int
        Random seed for reproducibility.

    Returns
    -------
    tuple
        (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("✅ Data split and scaled:")
    print(f"   Training set: {X_train_scaled.shape}")
    print(f"   Test set: {X_test_scaled.shape}")

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler
class HepatitisDataset(Dataset):
Custom PyTorch Dataset for Hepatitis data.
This dataset can be reused with different models and training approaches.
Parameters
- X (np.ndarray or pd.DataFrame): Feature matrix.
- y (np.ndarray or pd.Series): Target vector.
Attributes
- X (torch.FloatTensor): Feature matrix as a FloatTensor.
- y (torch.LongTensor): Target vector as a LongTensor.
Examples
>>> from src.data import HepatitisDataset
>>> dataset = HepatitisDataset(X_train, y_train)
>>> loader = DataLoader(dataset, batch_size=32, shuffle=True)
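A minimal, self-contained usage sketch (the random X_demo and y_demo below are synthetic placeholders, not the real hepatitis features):

import numpy as np
from torch.utils.data import DataLoader
from src.data import HepatitisDataset

# Synthetic stand-ins: 100 patients, 12 features (matching prepare_features)
X_demo = np.random.rand(100, 12).astype(np.float32)
y_demo = np.random.randint(0, 2, size=100)

dataset = HepatitisDataset(X_demo, y_demo)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

batch_X, batch_y = next(iter(loader))
print(batch_X.shape, batch_y.shape)  # torch.Size([32, 12]) torch.Size([32])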
def download_dataset(target_path: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> bool | str:
Download the Hepatitis C dataset from a public source if not already present.
Parameters
- target_path (str): Path where the dataset should be saved.
- demo (bool): If True, downloads to a temporary file path and returns that path on success instead of True.
Returns
- bool or str: True if the download succeeded or the file already exists, False otherwise; in demo mode, the temporary file path on success.
Examples
>>> download_dataset()
True
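A short usage sketch of demo mode; note it returns the temporary CSV path (a truthy string) rather than True:

from src.data import download_dataset

ok = download_dataset()                  # standard mode: True/False, caches under data/raw/
tmp_path = download_dataset(demo=True)   # demo mode: temp file path on success, False on failure
if tmp_path:
    print(f"Demo CSV written to {tmp_path}")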
def load_raw_data(filepath: str = 'data/raw/hepatitis_data.csv', demo: bool = False) -> pd.DataFrame | None:
Load raw data from a CSV file. If the file doesn't exist, attempts to download it automatically.
Parameters
- filepath (str): Path to the CSV file to be loaded.
- demo (bool): If True, uses a temporary file path for demo purposes.
Returns
- pd.DataFrame or None: Loaded dataset as a pandas DataFrame, or None if loading failed.
Examples
>>> df = load_raw_data()
>>> df.head()
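Assuming the UCI mirror is reachable, a quick sanity-check sketch:

from src.data import load_raw_data

df = load_raw_data()  # downloads on the first run, then reads the cached CSV
if df is not None:
    print(df.shape)                     # e.g. (615, 14) for the UCI hcvdat0.csv snapshot
    print('Patient ID' in df.columns)   # True: 'Unnamed: 0' is renamed on load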
def get_data_info(df: pd.DataFrame) -> dict | None:
Summarize a loaded dataset.
Parameters
- df (pd.DataFrame): Dataset to summarize.
Returns
- dict or None: Shape, columns, missing-value counts, target distribution, and dtypes; None if df is None.
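For example, a sketch of inspecting the summary dict:

from src.data import load_raw_data, get_data_info

info = get_data_info(load_raw_data())
if info is not None:
    print(info['shape'])
    print(info['missing_values'][info['missing_values'] > 0])  # only columns with gaps
    print(info['target_distribution'])                          # raw 'Category' value counts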
def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, LabelEncoder] | None:
Clean and preprocess the dataset.
Parameters
- df (pd.DataFrame): Raw dataset to be cleaned.
Returns
- tuple[pd.DataFrame, LabelEncoder] or None: Cleaned dataset (with 'target' and 'sex_encoded' columns) and the fitted encoder for the Sex column.
Examples
>>> cleaned_df, sex_encoder = clean_data(df)
>>> cleaned_df.head()
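A toy sketch of the label simplification (the two-row DataFrame is illustrative, not real patient data):

import pandas as pd
from src.data import clean_data

toy = pd.DataFrame({'Category': ['0=Blood Donor', '1=Hepatitis C'], 'Sex': ['m', 'f']})
cleaned, sex_encoder = clean_data(toy)
print(cleaned[['Category', 'target', 'sex_encoded']])
# '0=Blood Donor' and '0s=suspect Blood Donor' map to 0; all other labels map to 1
print(sex_encoder.classes_)  # ['f' 'm']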
def prepare_features(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series, SimpleImputer] | None:
Prepare features for modeling.
Parameters
- data (pd.DataFrame): Cleaned dataset with necessary transformations applied.
Returns
- tuple[pd.DataFrame, pd.Series, SimpleImputer] or None: Imputed feature matrix, target vector, and the fitted imputer.
Examples
>>> X, y, imputer = prepare_features(cleaned_df)
>>> X.head()
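The fitted imputer is returned so the training medians can be reapplied to new records at inference time; a sketch, assuming cleaned_df came from clean_data:

import numpy as np
from src.data import prepare_features

X, y, imputer = prepare_features(cleaned_df)

new_patient = X.head(1).copy()
new_patient.loc[:, 'ALB'] = np.nan       # simulate a missing lab value
filled = imputer.transform(new_patient)  # gap filled with the training-set median for ALB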
def split_and_scale_data(X: pd.DataFrame, y: pd.Series, test_size: float = 0.2, random_state: int = 42) -> tuple[np.ndarray, np.ndarray, pd.Series, pd.Series, StandardScaler]:
Split and scale the dataset.
Parameters
- X (pd.DataFrame): Feature matrix.
- y (pd.Series): Target vector.
- test_size (float): Proportion of the dataset to include in the test split.
- random_state (int): Random seed for reproducibility.
Returns
- tuple: (X_train_scaled, X_test_scaled, y_train, y_test, scaler)
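Putting it together, a hedged end-to-end sketch of the intended pipeline, chaining the functions in this module and feeding the result to HepatitisDataset:

from torch.utils.data import DataLoader
from src.data import (HepatitisDataset, clean_data, load_raw_data,
                      prepare_features, split_and_scale_data)

df = load_raw_data()
cleaned, sex_encoder = clean_data(df)
X, y, imputer = prepare_features(cleaned)
X_train, X_test, y_train, y_test, scaler = split_and_scale_data(X, y)

train_loader = DataLoader(HepatitisDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(HepatitisDataset(X_test, y_test), batch_size=32)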