软件公司产品营销大数据分析(下)

机器学习训练营(qq群号:696721295)—— 机器学习案例详解的直播互动平台
下期直播案例预告:大数据预测商品的销售量波动趋势

分析数据

现在,让我们以问题的形式探索产品销售量情况,并从数据里找到答案。

问题一:数据集里有多少家商店?

# Count the distinct shop_id values present in the sales data.
dataset_sales %>%
  distinct(shop_id) %>%
  count()

软件公司产品营销大数据分析(下)

问题二:哪家商店最受欢迎?整体销售量是多少?

# Sum item_cnt_day per shop and order the shops from highest to lowest total.
most.popular.shop <- dataset_sales %>%
  group_by(shop_id) %>%
  summarise(total.sales.by.shop = sum(item_cnt_day)) %>%
  ungroup() %>%
  arrange(desc(total.sales.by.shop))

# Horizontal bar chart: one bar per shop, bars ordered by their total.
ggplot(
  most.popular.shop,
  aes(
    x = reorder(as.factor(shop_id), total.sales.by.shop),
    y = total.sales.by.shop,
    fill = as.factor(shop_id)
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most popular shop with most sales",
    x = "Shop(s)",
    y = "Total sales",
    fill = "Shop Id"
  )

# The summary table is only needed for the chart above.
rm(most.popular.shop)

软件公司产品营销大数据分析(下)

问题三:在所有的商店里有多少件商品?

# Count the distinct item_id values present in the sales data.
dataset_sales %>%
  distinct(item_id) %>%
  count()

软件公司产品营销大数据分析(下)

问题四:在所有的商品里有多少个类别?

# Count the distinct item_category_id values present in the sales data.
dataset_sales %>%
  distinct(item_category_id) %>%
  count()

软件公司产品营销大数据分析(下)

问题五:哪家商店的商品最多?

# Number of distinct item_id values per shop, largest assortment first.
most.items.in.shop <- dataset_sales %>%
  group_by(shop_id) %>%
  summarise(total.items.per.shop = n_distinct(item_id)) %>%
  ungroup() %>%
  arrange(desc(total.items.per.shop))

# Horizontal bar chart: one bar per shop, bars ordered by item count.
ggplot(
  most.items.in.shop,
  aes(
    x = reorder(as.factor(shop_id), total.items.per.shop),
    y = total.items.per.shop,
    fill = as.factor(shop_id)
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most items available at shop(s)",
    x = "Shop(s)",
    y = "Total number of items at shop",
    fill = "Shop Id"
  )

# The summary table is only needed for the chart above.
rm(most.items.in.shop)

软件公司产品营销大数据分析(下)

问题六:在每一家商店,哪件商品最受欢迎?卖得最好?

# For every shop, find the item with the highest summed item_cnt_day.
# summarise() drops only the last grouping level (item_id), so the filter
# that follows is still evaluated separately for each shop_id.
most.sold.item.at.shop <- dataset_sales %>%
  group_by(shop_id, item_id) %>%
  summarise(most.sold.item.count = sum(item_cnt_day)) %>%
  # Keep exactly one row per shop. `== max(...)` would keep every tied item,
  # and the tied rows would then be stacked into one inflated bar.
  # row_number() breaks ties by row order, i.e. the first tied item wins.
  filter(row_number(desc(most.sold.item.count)) == 1) %>%
  ungroup() %>%
  arrange(desc(most.sold.item.count))

# Horizontal bar chart: one bar per shop showing the count of its top item.
ggplot(
  most.sold.item.at.shop,
  aes(
    x = reorder(as.factor(shop_id), most.sold.item.count),
    y = most.sold.item.count,
    fill = as.factor(shop_id)
  )
) +
  geom_col() +
  # Write the winning item_id at the base of each bar so the chart answers
  # "which item", not only "how many".
  geom_text(aes(y = 0, label = item_id), hjust = 0, size = 3) +
  coord_flip() +
  labs(
    title = "Most popular/sold item at shop(s)",
    x = "Shop(s)",
    y = "Most sold item at shop",
    fill = "Shop Id"
  )

# The summary table is only needed for the chart above.
rm(most.sold.item.at.shop)

软件公司产品营销大数据分析(下)

问题七:哪家商店的类别最多?

# Number of distinct item_category_id values per shop, largest first.
most.categories.in.shop <- dataset_sales %>%
  group_by(shop_id) %>%
  summarise(total.categories.per.shop = n_distinct(item_category_id)) %>%
  ungroup() %>%
  arrange(desc(total.categories.per.shop))

# Horizontal bar chart: one bar per shop, bars ordered by category count.
ggplot(
  most.categories.in.shop,
  aes(
    x = reorder(as.factor(shop_id), total.categories.per.shop),
    y = total.categories.per.shop,
    fill = as.factor(shop_id)
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most categories item at shop(s)",
    x = "Shop(s)",
    y = "Total item categories at shop",
    fill = "Shop Id"
  )

# The summary table is only needed for the chart above.
rm(most.categories.in.shop)

软件公司产品营销大数据分析(下)

问题八:列出每家商店销售的全部产品类别。

# Build one row per shop holding a comma-separated, sorted list of the
# distinct item_category_id values that appear for that shop.
product.categories.sold.by.shop <- dataset_sales %>%
  group_by(shop_id) %>%
  summarise(
    product.categories.list =
      paste(sort(unique(item_category_id)), collapse = ",")
  ) %>%
  ungroup()

# Print explicitly with no row or width limit. Default tibble printing shows
# only the first rows and truncates long strings, and a bare object name
# prints nothing when the script is source()d.
print(product.categories.sold.by.shop, n = Inf, width = Inf)

# The table is only needed for the listing above.
rm(product.categories.sold.by.shop)

软件公司产品营销大数据分析(下)

问题九:按月份统计当月每日的整体销售额。

# For every (month, day) pair, sum item_price * item_cnt_day over all rows.
month.day.wise.total.sales <- dataset_sales %>%
  group_by(month, day) %>%
  summarise(total.sales.everyday = sum(item_price * item_cnt_day)) %>%
  ungroup() %>%
  arrange(month, day)

# One line per month, with day of month on the x axis.
ggplot(
  month.day.wise.total.sales,
  aes(
    x = day,
    y = total.sales.everyday,
    group = month,
    color = as.factor(month)
  )
) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 31, by = 1)) +
  # The legend belongs to the `color` aesthetic, so its title must be set
  # through `color`; a `fill` label would be ignored by this plot.
  labs(
    title = "Month-Day-wise total sales",
    x = "Day(s) of Month",
    y = "Total sales everyday",
    color = "Month"
  )

软件公司产品营销大数据分析(下)

问题十:按月份统计各月售出商品数量占总销量的百分比。

# Add a per-row sale value column (item_price * item_cnt_day) to the data.
dataset_sales$sale_price <-
  dataset_sales$item_price * dataset_sales$item_cnt_day

# Grand total of item_cnt_day; the denominator for the monthly shares below.
total.no.of.items.sold <- sum(dataset_sales$item_cnt_day)

# Share of the grand total sold in each date_block_num, rounded to 3 decimals.
monthly.items.sales <- dataset_sales %>%
  group_by(date_block_num) %>%
  summarise(
    monthly.items.sales.frequency =
      round(sum(item_cnt_day) / total.no.of.items.sold, digits = 3)
  )

# Pie chart: a single bar normalised with position = "fill", drawn in polar
# coordinates. Only one bar layer is drawn so that the slices and the labels
# (also placed with position_fill) share the same scale.
ggplot(
  monthly.items.sales,
  aes(
    x = "",
    y = monthly.items.sales.frequency,
    fill = as.factor(date_block_num)
  )
) +
  geom_col(width = 1, position = "fill") +
  coord_polar(theta = "y", start = 0) +
  geom_label(
    aes(label = paste0(monthly.items.sales.frequency * 100, "%")),
    position = position_fill(vjust = 0.5)
  ) +
  labs(
    title = "% of items sold per month",
    x = "",
    y = "Monthly item sale frequency",
    fill = "Months"
  )

# Drop the helpers; the sale_price column stays on dataset_sales.
rm(total.no.of.items.sold, monthly.items.sales)

软件公司产品营销大数据分析(下)