# Lecture notes — May 30, 2024
# Load the raw housing dataset and take a first look at it.
data = pd.read_csv('housing.csv')

# Quick inspection of the raw data before any cleaning.
# NOTE(review): these bare expressions only render in a notebook; in a
# plain script they compute and discard their result.
data.head()
data.info()
data.describe()

# Drop every row that has at least one missing value.
data = data.dropna()

# Separate the target from the feature matrix.
y = data['median_house_value']
X = data.drop(columns='median_house_value')
# Hold out 20% of the data for evaluation, seeded for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Log-transform the heavily right-skewed count columns.
# BUG FIX: the original applied log1p to X_train only, so the model was
# later scored on untransformed test features — train and test must go
# through identical preprocessing.
skewed_cols = ['total_rooms', 'total_bedrooms', 'population', 'households']
for col in skewed_cols:
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])

# One-hot encode the categorical column in both splits.
X_train = pd.get_dummies(X_train, columns=['ocean_proximity'])
X_test = pd.get_dummies(X_test, columns=['ocean_proximity'])

# BUG FIX: align the dummy columns — a category present in only one split
# would otherwise produce mismatched feature matrices and break .fit/.score.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
# Ensure both training and testing data have the same dummy columns.
# (FIX: the sentence above was fused directly onto the statement below in
# the transcript, which made the line a syntax error.)
# Re-attach the target to the training features for correlation analysis.
train_data = X_train.join(y_train)

# Derived ratio features to inspect alongside the raw columns.
# NOTE(review): total_rooms / total_bedrooms / households were log1p-
# transformed earlier, so these are ratios of logged values — confirm
# that is the intended feature definition.
train_data['bedroom_ratio'] = train_data['total_bedrooms'] / train_data['total_rooms']
train_data['household_rooms'] = train_data['total_rooms'] / train_data['households']

# Visualize pairwise correlations (including the target) as a heatmap.
sns.heatmap(train_data.corr(), annot=True)
# Baseline model: ordinary least-squares regression.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
# FIX: the linear model was trained but never evaluated — report its
# held-out R^2 so the two models can actually be compared.
print('LinearRegression R^2:', model.score(X_test, y_test))

# Random forest baseline. random_state added for reproducibility, matching
# the seeded train/test split above (the original forest was nondeterministic).
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)
# FIX: a bare .score() expression displays nothing in a plain script.
print('RandomForestRegressor R^2:', forest.score(X_test, y_test))
# Hyper-parameter search over the random forest (3*3*3*4 = 108 candidates,
# 5-fold CV => 540 fits).
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    # FIX: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3 for
    # RandomForestRegressor — the search raised an error on modern versions.
    # 1.0 (use all features per split) is the equivalent setting.
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'max_depth': [None, 10, 20, 30],
}

grid_search = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # free speedup: parallelize the 540 fits across all cores
)
grid_search.fit(X_train, y_train)

# Refit-on-full-train best model, then report its held-out R^2.
best_forest = grid_search.best_estimator_
print('Best params:', grid_search.best_params_)
print('Tuned forest R^2:', best_forest.score(X_test, y_test))
# End of the lecture.