R Regression: Logistic Regression
# Ref: http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/
install.packages("mlbench")
# Load the Pima Indians diabetes data set
data("PimaIndiansDiabetes2", package = "mlbench")
# Inspect the data
head(PimaIndiansDiabetes2, 4)
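# Extra check (not in the sthda tutorial): the raw data codes missing
# values as NA; counting them per column shows why cleaning is needed
colSums(is.na(PimaIndiansDiabetes2))
dim(PimaIndiansDiabetes2)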
library(tidyverse)
library(caret)
theme_set(theme_bw())
# PREPARE DATA
# Load the data and remove NAs
data("PimaIndiansDiabetes2", package = "mlbench")
PimaIndiansDiabetes2 <- na.omit(PimaIndiansDiabetes2)
# Inspect the data
sample_n(PimaIndiansDiabetes2, 3)
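# Extra check (not in the sthda tutorial): confirm how many complete
# cases remain after na.omit() and inspect the column types
dim(PimaIndiansDiabetes2)
str(PimaIndiansDiabetes2)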
# Split the data into training and test sets
set.seed(123)
training.samples <- PimaIndiansDiabetes2$diabetes %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]
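# Extra check (not in the sthda tutorial): createDataPartition() samples
# within each outcome class, so the class balance should be similar
prop.table(table(train.data$diabetes))
prop.table(table(test.data$diabetes))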
# Fit the model
# The R function glm(), for generalized linear models, can be used to compute
# logistic regression. You need to specify family = binomial, which tells R
# to fit a logistic regression.
model <- glm(diabetes ~ ., data = train.data, family = binomial)
# Summarize the model
summary(model)
# Make predictions
probabilities <- model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model accuracy
mean(predicted.classes == test.data$diabetes)
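# Extra (not in the sthda tutorial): caret's confusionMatrix() breaks the
# accuracy down into sensitivity and specificity; it expects factors with
# matching levels
confusionMatrix(factor(predicted.classes, levels = c("neg", "pos")),
                test.data$diabetes)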
# Simple Logistic Regression
model <- glm(diabetes ~ glucose, data = train.data, family = binomial)
summary(model)$coef
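# Extra (not in the sthda tutorial): the coefficients are on the log-odds
# scale; exponentiating gives the odds ratio per unit increase in glucose
exp(coef(model))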
# Prediction
newdata <- data.frame(glucose = c(20, 180))
probabilities <- model %>% predict(newdata, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
predicted.classes
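# Extra (not in the sthda tutorial): the same probabilities reproduced by
# hand from the fitted coefficients via the logistic function
# p = 1 / (1 + exp(-(b0 + b1 * glucose)))
b <- coef(model)
1 / (1 + exp(-(b[1] + b[2] * c(20, 180))))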
# PLOT Prediction Function
train.data %>%
mutate(prob = ifelse(diabetes == "pos", 1, 0)) %>%
ggplot(aes(glucose, prob)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
labs(
title = "Logistic Regression Model",
x = "Plasma Glucose Concentration",
y = "Probability of being diabete-pos"
)
#
# Multiple Logistic Regression
#
model <- glm( diabetes ~ glucose + mass + pregnant,
data = train.data, family = binomial)
summary(model)$coef
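# Extra (not in the sthda tutorial): odds ratios with Wald 95% confidence
# intervals (confint.default() uses the normal approximation)
exp(cbind(OddsRatio = coef(model), confint.default(model)))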
# Model Summary (full model, all predictors)
model <- glm(diabetes ~ ., data = train.data, family = binomial)
summary(model)$coef
# Alternative ways to display the coefficients
coef(model)
summary(model)$coef
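# Extra (not in the sthda tutorial): keep only the coefficients whose
# p-value is below 0.05, to motivate the reduced model fitted next
coefs <- summary(model)$coef
coefs[coefs[, "Pr(>|z|)"] < 0.05, ]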
# MODIFY THE MODEL (keep only the significant predictors)
model <- glm( diabetes ~ pregnant + glucose + pressure + mass + pedigree,
data = train.data, family = binomial)
# Predict
probabilities <- model %>% predict(test.data, type = "response")
head(probabilities)
# Check how R dummy-codes the outcome classes ("pos" = 1)
contrasts(test.data$diabetes)
# Predict the class of individuals:
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
# Assessing model accuracy
mean(predicted.classes == test.data$diabetes)
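# Extra (not in the sthda tutorial): accuracy depends on the 0.5 cutoff;
# the ROC curve and AUC from the pROC package summarize performance across
# all cutoffs (requires install.packages("pROC"))
library(pROC)
roc.obj <- roc(test.data$diabetes, probabilities, levels = c("neg", "pos"))
auc(roc.obj)
plot(roc.obj)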
# You can also fit a generalized additive model (GAM) when linearity of a
# predictor cannot be assumed. This can be done using the mgcv package:
library("mgcv")
# Fit the model
gam.model <- gam(diabetes ~ s(glucose) + mass + pregnant,
data = train.data, family = "binomial")
# Summarize model
summary(gam.model)
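# Extra (not in the sthda tutorial): plot the estimated smooth of glucose
# on the log-odds scale, with a shaded confidence band
plot(gam.model, shade = TRUE)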
# Make predictions
probabilities <- gam.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
# Model Accuracy
mean(predicted.classes == test.data$diabetes)
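# Extra (not in the sthda tutorial): compare the reduced GLM and the GAM by
# AIC (lower is better); note they use different predictor sets, so this is
# only illustrative
AIC(model, gam.model)

==Referensi==

* http://www.sthda.com/english/articles/36-classification-methods-essentials/151-logistic-regression-essentials-in-r/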