---
title: "Predict Forest Fire Burn Area"
output: html_document
---

## Setup
```{r}
# load libraries
library(tidyverse)
library(GGally)
library(caret)

# set a seed so the random steps below (data partitioning, cross-validation) give identical results on each run
set.seed(1234)

# read dataset
ff = read.delim("forestfires.tsv", header = TRUE, sep = "\t")

```
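
Since the tidyverse is already attached, `readr::read_tsv()` is an equivalent way to load the file. A minimal sketch, assuming the same tab-separated layout (marked `eval=FALSE` so the pipeline above is unchanged):

```{r, eval=FALSE}
# alternative load with readr; returns a tibble instead of a base data.frame
ff = read_tsv("forestfires.tsv", show_col_types = FALSE)
```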
## Explore the data
```{r}
# get a high-level view of the dataset
# there are 517 observations and 13 variables
glimpse(ff)
summary(ff)
str(ff)

# check for missing values in the dataset
colSums(is.na(ff)) # the data is very clean: there are no missing values

# look at the distribution of the outcome (target) variable: the burned area
ggplot(ff, aes(x = area)) + geom_histogram()

# the graph shows that burn area is highly right-skewed, with many zero values
# log-transform the burn area to reduce the skewness
ff$log_area = log10(ff$area + 1)

ggplot(ff, aes(x = log_area)) + geom_histogram()

# after filtering out the zero burn areas, the outcome looks closer to normally distributed
ggplot(filter(ff, area > 0), aes(x = area + 1)) +
  geom_histogram() +
  scale_x_log10("Burn Area (log10)", breaks = c(1, 10, 100, 1000))

# explore the relationship between burn area and location within the park
burn_coord = ff %>% group_by(X, Y) %>% summarize(area_mean = mean(area))

ggplot(burn_coord, aes(x = factor(X), y = factor(Y), fill = area_mean)) +
  geom_tile() +
  scale_fill_gradient2()

# explore whether forest fires show seasonality
# put the months in calendar order
ff$month = factor(ff$month, levels = c("jan", "feb", "mar", "apr", "may", "jun", 
                                       "jul", "aug", "sep", "oct", "nov", "dec"))
ggplot(ff, aes(x = RH, y = area)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~month) +
  scale_y_log10()

# use ggpairs to explore correlations between variables
ggpairs(select(ff, temp, RH, wind, log_area))
```
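
As a numeric complement to the `ggpairs()` panel, a plain correlation matrix over the same columns (a minimal sketch):

```{r}
# pairwise Pearson correlations for the variables shown in ggpairs
cor(select(ff, temp, RH, wind, log_area))
```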

Two things stand out from the exploration:

(1) Weekends appear to be associated with larger burn areas.
(2) Burn area seems to vary with the seasons.
```{r}
# feature engineering
# fires may be more likely on weekends because more visitors come to the park
# feature 1: whether the day falls on a weekend
ff$is_weekend = ifelse(ff$day %in% c("sat", "sun"), 1, 0)
ff$is_weekend = factor(ff$is_weekend)

# check how the observations are spread across months
summary(ff$month)
length(ff$month)

# burn areas vary widely between months
# feature 2: season
# I convert month to season
ff$season = case_when(
  ff$month %in% c("dec", "jan", "feb") ~ "winter",
  ff$month %in% c("mar", "apr", "may") ~ "spring",
  ff$month %in% c("jun", "jul", "aug") ~ "summer",
  TRUE                                 ~ "autumn"
)
ff$season = as.factor(ff$season)

# plot the new features against burn area, restricted to observations with log_area > 0
ff1 = filter(ff, log_area > 0)

# relationship between is_weekend and burn area
ggplot(ff1, aes(x = is_weekend, y = log_area)) +
  geom_point() +
  geom_boxplot()

# relationship between season and burn area
ggplot(ff1, aes(x = season, y = log_area)) +
  geom_point() +
  geom_boxplot()

# convert month, day, and season into binary (dummy) variables
month = model.matrix(~month - 1, data = ff)
day = model.matrix(~day - 1, data = ff)
season_binary = model.matrix(~season - 1, data = ff)

ff = cbind(ff, month, day, season_binary)
# drop the original factor columns and the raw area (log_area is the outcome)
ff = select(ff, -month, -day, -season, -area)

```
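
caret's `dummyVars()` offers the same one-hot encoding in a single step, as an alternative to the per-column `model.matrix()` calls above. A sketch, applied before the factor columns are dropped (marked `eval=FALSE` so the pipeline is unchanged):

```{r, eval=FALSE}
# one-hot encode all three factor columns at once with caret
dummies = dummyVars(~ month + day + season, data = ff)
ff_dummies = predict(dummies, newdata = ff)
head(ff_dummies)
```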

Use createDataPartition to put 80% of the forest fire data into a training set; the remaining 20% becomes the test set.
```{r}
in_train = createDataPartition(y = ff$log_area, p = 0.8, list = FALSE)
head(in_train)
is(in_train)

ff_train = ff[in_train, ]
ff_test = ff[-in_train, ]
is(ff_train)

dim(ff_train)
dim(ff_test)
```
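
A quick sanity check that the split preserved the outcome's distribution; `createDataPartition` stratifies on quantiles of `y`, so the two summaries should look similar (a minimal sketch):

```{r}
# compare the outcome distribution in the training and test sets
summary(ff_train$log_area)
summary(ff_test$log_area)
```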

Preprocess the data 
```{r}
# pre-process: center the predictors and remove near-zero-variance columns
nearZeroVar(ff, saveMetrics = TRUE)
ff_preprocess = preProcess(ff_train, method = c("center", "nzv"))

# use predict to apply the same transformations to the training and test sets
ff_train_proc = predict(ff_preprocess, ff_train)
ff_test_proc = predict(ff_preprocess, ff_test)

```
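
To see which columns the `nzv` step dropped, comparing column names before and after is enough (a small sketch):

```{r}
# columns removed by the near-zero-variance filter
setdiff(names(ff_train), names(ff_train_proc))
```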

Develop a multiple linear regression model 
```{r}
# Forward selection
# start with no features and add them sequentially, keeping whichever addition
# most improves cross-validated model performance
forward_model = train(log_area ~ ., data = ff_train_proc,
                      method = "leapForward", tuneGrid = expand.grid(nvmax = 1:10), 
                      trControl = trainControl(method = "cv", number = 10))

plot(forward_model)
forward_model
# plot the variables selected by the final model, ranked by adjusted R-squared
plot(forward_model$finalModel, scale = "adjr2")

```
## Ridge Regression

Ridge regression addresses the parameter instability caused by highly correlated features. It shrinks unstable coefficients by adding a penalty on the size of the coefficients to the ordinary least squares objective, so overly large coefficients are penalized.
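
In symbols (the standard ridge objective, stated here for reference rather than taken from the original analysis):

$$
\hat{\beta}^{\text{ridge}} = \arg\min_{\beta} \; \sum_{i=1}^{n} \left( y_i - x_i^{\top}\beta \right)^2 + \lambda \sum_{j=1}^{p} \beta_j^2
$$

where $\lambda \ge 0$ is the penalty strength tuned over `lambda` in the grid below.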
```{r}
ridge_model = train(log_area ~ ., data = ff_train_proc,
                    method = "ridge", tuneGrid = expand.grid(lambda = seq(0, 1, 0.05)),
                    trControl = trainControl(method = "cv", number = 10))

ridge_model
plot(ridge_model)

# plot the coefficient paths of the final model
plot(ridge_model$finalModel)

```

## Lasso (Least Absolute Shrinkage and Selection Operator)

Like ridge, the lasso addresses the parameter instability caused by highly correlated features, but it limits the sum of the absolute values of the regression coefficients. This can shrink some coefficients exactly to zero, so the lasso also performs feature selection.
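
The corresponding objective swaps the squared penalty for an absolute-value one (again the standard formulation):

$$
\hat{\beta}^{\text{lasso}} = \arg\min_{\beta} \; \sum_{i=1}^{n} \left( y_i - x_i^{\top}\beta \right)^2 + \lambda \sum_{j=1}^{p} \lvert \beta_j \rvert
$$

Note that caret's `method = "lasso"` tunes `fraction`, the fraction of the full L1 norm allowed, rather than $\lambda$ directly.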
```{r}
lasso_model = train(log_area ~ ., data = ff_train_proc,
                    method = "lasso", tuneGrid = expand.grid(fraction = seq(0, 1, 0.5)),
                    trControl = trainControl(method = "cv", number = 10))
lasso_model

plot(lasso_model)

# plot the coefficient paths of the final model
plot(lasso_model$finalModel)

# get the model coefficients along the lasso path
lasso_coefs = predict(lasso_model$finalModel, type = "coefficients")
```
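
To read off the coefficients at the cross-validated `fraction` specifically, `predict.enet()` accepts an `s` argument in `"fraction"` mode (a sketch, relying on the elasticnet backend that caret uses for `method = "lasso"`):

```{r}
# coefficients at the best tuned fraction; zeros mark dropped features
predict(lasso_model$finalModel, type = "coefficients",
        mode = "fraction", s = lasso_model$bestTune$fraction)
```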
## Model Comparison
```{r}
# compare the different models and select the final one to report
compare = resamples(list(forward_selection = forward_model, 
                         ridge_selection = ridge_model, lasso_selection = lasso_model))

# compare RMSE and R-squared
summary(compare)

# plot the result
# the lasso is the final model: it has the lowest RMSE (root mean square error), 0.6113552
dotplot(compare)

```
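
As a final check, the chosen model can be evaluated on the held-out test set, which the analysis above stops short of; `postResample()` is caret's helper for this (a minimal sketch):

```{r}
# out-of-sample performance of the selected lasso model
lasso_test_pred = predict(lasso_model, newdata = ff_test_proc)
postResample(pred = lasso_test_pred, obs = ff_test_proc$log_area)
```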