模型构建与分析
多元线性回归模型调用:lm(formula = Price ~ GSM + Material + Pattern + Opacity, data = reg_data)
回归结果:
解释结果:
- (Intercept):截距项估计值为 43.847715,即当 GSM 和遮光度(Opacity)均为 0、材质为纯聚酯纤维(Pure_Polyester)、图案为简单图案(Simple)时,预计价格约为 43.85 美元
- GSM:估计值为 0.243801,在其他变量保持不变的情况下,GSM 每增加 1,价格预计平均增加约 0.244 美元
- MaterialBlended:估计值为 20.174001,在其他变量保持不变的情况下,使用混纺材质的产品相比纯聚酯纤维(Pure_Polyester)产品,价格平均高出约 20.17 美元
- PatternComplex 和 Opacity 的 p 值分别为 0.7707 和 0.9668,均大于 0.05 的显著性水平,说明在该模型中二者对价格的影响在统计上不显著
源代码
# Packages used by the script.
# Fixes: the first library() call ended in a stray backtick, which made the
# whole file unparseable, and stringr was attached twice (one call suffices).
library(dplyr)
library(stringr)
library(car)
library(tidyverse)
library(caret)
library(readxl)

# Read the raw workbook.
# NOTE(review): absolute, machine-specific path - adjust before running on
# another computer.
file_path <- "C:\\Users\\PC-HMC\\Desktop\\123.xlsx"
data <- read_excel(file_path)
str(data) # inspect the data structure
sum(is.na(data)) # count missing cells
# 1. Material: after lower-casing, a value is labelled "Pure_Polyester" only
#    when it mentions polyester and contains no blend separator ("/", "+" or
#    ","); any other value is labelled "Blended".
#    NOTE(review): a missing Material presumably stays NA - verify.
# 2. Pattern: values containing "solid" (case-insensitive) become "Simple",
#    everything else becomes "Complex".
data <- data %>%
  mutate(
    Material = tolower(Material),
    Material = ifelse(
      !str_detect(Material, "polyester") | str_detect(Material, "/|\\+|,"),
      "Blended",
      "Pure_Polyester"
    ),
    Pattern = ifelse(
      str_detect(Pattern, regex("solid", ignore_case = TRUE)),
      "Simple",
      "Complex"
    )
  )
# 3 & 4. Clean Opacity and GSM: each raw cell is reduced to the mean of the
# numbers it contains, so a range (e.g. "70%-80%") collapses to its midpoint.

# Mean of all unsigned decimal numbers found in one raw value.
#
# text: a single raw cell (only the first element is used).
# Returns a double; NA_real_ when the cell is missing or contains no digits.
# Always returning a double keeps the result type-stable for vapply().
mean_of_numbers <- function(text) {
  if (length(text) == 0 || is.na(text[[1]])) {
    return(NA_real_)
  }
  nums <- as.numeric(str_extract_all(text, "\\d+\\.?\\d*")[[1]])
  if (length(nums) == 0) {
    return(NA_real_)
  }
  mean(nums)
}

# Numeric opacity for one raw Opacity cell.
clean_opacity <- function(opacity) {
  mean_of_numbers(opacity)
}
# vapply() guarantees a plain numeric column even when every cell is missing
# or the data has zero rows; USE.NAMES = FALSE avoids carrying the raw cell
# text along as vector names.
data$Opacity_Clean <- vapply(
  data$Opacity, clean_opacity, numeric(1),
  USE.NAMES = FALSE
)

# Numeric GSM for one raw GSM cell.
clean_gsm <- function(gsm) {
  mean_of_numbers(gsm)
}
data$GSM_Clean <- vapply(
  data$GSM, clean_gsm, numeric(1),
  USE.NAMES = FALSE
)
# 5. Price: a value containing "-" is treated as a range and replaced by the
#    mean of the numbers in it; any other value is converted with as.numeric().
#
# price: a single raw Price cell.
# Returns a double (NA when a non-range value cannot be converted).
clean_price <- function(price) {
  is_range <- grepl("-", price, fixed = TRUE)
  if (!is_range) {
    return(as.numeric(price))
  }
  bounds <- as.numeric(str_extract_all(price, "\\d+\\.?\\d*")[[1]])
  mean(bounds)
}
# Most Price cells hold a single value, but a cell may hold a range.
# A bare as.numeric() would turn such a cell into NA and the later na.omit()
# would silently drop the row, so every cell goes through clean_price().
# For single-valued cells the result is the same as as.numeric().
# NOTE(review): clean_price() treats any "-" as a range separator, so a
# negative price would lose its sign - confirm prices are never negative.
data$Price_Clean <- vapply(
  data$Price, clean_price, numeric(1),
  USE.NAMES = FALSE
)
# 6. GSM ranges need no further handling: GSM_Clean already holds the
#    averaged value.

# Quick look at the cleaned data.
head(data)

# Regression data set: keep only the model variables under short names and
# drop every row that has a missing value.
reg_data <- na.omit(
  select(
    data,
    Price = Price_Clean,
    GSM = GSM_Clean,
    Material,
    Pattern,
    Opacity = Opacity_Clean
  )
)

# Encode the categorical predictors as factors; the first level listed is the
# reference category.
material_levels <- c("Pure_Polyester", "Blended")
pattern_levels <- c("Simple", "Complex")
reg_data[["Material"]] <- factor(reg_data[["Material"]], levels = material_levels)
reg_data[["Pattern"]] <- factor(reg_data[["Pattern"]], levels = pattern_levels)

# Fit the multiple linear regression of Price on the four predictors.
model <- lm(Price ~ GSM + Material + Pattern + Opacity, data = reg_data)

# Show the regression results.
summary(model)
library(ggplot2)

# Box plot of Price for each Pattern category.
ggplot(data = reg_data, mapping = aes(x = Pattern, y = Price)) +
  geom_boxplot() +
  ggtitle("Price vs Pattern") +
  xlab("Pattern") +
  ylab("Price")

# Scatter plot of Price against Opacity with a loess smoother on top.
ggplot(data = reg_data, mapping = aes(x = Opacity, y = Price)) +
  geom_point() +
  geom_smooth(method = "loess") +
  ggtitle("Price vs Opacity") +
  xlab("Opacity (%)") +
  ylab("Price")
# Regression diagnostics: outliers and high-influence observations.

# Standardized residuals of the fitted model. The script previously filtered
# on `std_resid` without ever defining it, which fails with
# "object 'std_resid' not found".
std_resid <- rstandard(model)

# Influence measure (Cook's distance) for every observation.
cooksd <- cooks.distance(model)

# filter() below matches these vectors to the rows of reg_data by position,
# so they must have exactly one entry per row.
stopifnot(
  length(std_resid) == nrow(reg_data),
  length(cooksd) == nrow(reg_data)
)

# Outliers: absolute standardized residual above 3.
outliers <- reg_data %>% filter(abs(std_resid) > 3)
print(outliers)

# High-influence points: Cook's distance above 4 / (n - k - 1), where k is the
# number of predictors. The coefficient vector also contains the intercept,
# so k is its length minus one; subtracting the full length and then another 1
# (as before) made the denominator one too small.
n_obs <- nrow(reg_data)
n_predictors <- length(coef(model)) - 1
cooks_cutoff <- 4 / (n_obs - n_predictors - 1)
high_influence <- reg_data %>% filter(cooksd > cooks_cutoff)
print(high_influence)