小雷的 Programming & Analytic 日誌: R Machine Learning Predict Product - KNN

開始進入機器學習的領域，這真的是很有趣的部分，這次要透過基本的程式語言進行KNN的演算編輯，之後再來加入模組進行演算，看看會不會差太多😆，其實用到的函數不多，出來的結果也是很耐人尋味，數據集可以不斷增加，相似度的結果也會出現更多有趣的事情，本篇以年齡、收入做比對模擬可以推薦的手機品牌(Apple, Android)，沒有任何的歧視意味，開始建立數據集時只是以直覺填入。

KNN需要的一些基本數學公式

點A到點B距離公式 : $|AB| = \sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2}$
標準差 : R有基本函數可以用`sd()`
平均值 : 函數`mean()`
單位標準化 : (原始數值 - 平均值) / 標準差
相似度 : 以標準化後兩點之間的距離表示（距離越小代表越相似），計算時使用函數`sqrt()`

數據集可以自己建立，小雷是建立一個.csv檔案，裡面可以透過其他欄位進行相似比對

完整的程式碼也會放在Github，有需要可以自行取用

小雷的Github >> 請點我

載入R包

library(tidyverse)
library(dplyr)
library(ggplot2)

載入數據位置與數據集

# ====================================================

# Phase1 : Import data

# ====================================================

# Folder that holds the training CSV.
# The file is read through an explicit path instead of calling setwd(), so the
# script no longer changes the session's working directory as a side effect.
# NOTE(review): this absolute path is machine-specific; adjust it per machine.
data_dir <- "D:\\Github_version_file_R\\data_set\\marchine_learning_data\\KNN"

# Load the labelled data set whose rows serve as the known reference points.
knn_df <- read.csv(file.path(data_dir, "machine_learn_R.csv"))

# Show the loaded data (auto-prints when run interactively).
knn_df

# 數據集內容

自定義想要模擬的預測數據

# ====================================================
# Phase2 : Create forecast data
# ====================================================

# Ask for the age and yearly income of the person to predict for.
# readline() returns text, so both answers are converted to numeric.
age <- as.numeric(readline(prompt = "請輸入年齡 : "))
income <- as.numeric(readline(prompt = "請輸入年收入/萬元 : "))

# Fail fast on blank or non-numeric input: as.numeric() turns it into NA,
# which would otherwise propagate into every distance computed later on.
if (is.na(age) || is.na(income)) {
    stop("年齡與年收入必須為數值", call. = FALSE)
}

# One-row tibble describing the person to classify.
predict_df <- tibble::tribble(
    ~name, ~age, ~income,
    "Rex", age, income
)

# 自定義輸入

r$> age <- as.numeric(readline(prompt = "請輸入年齡 : ")) income <- as.numeric(readline(prompt = "請輸入年收入/萬元 : ")) 請輸入年齡 : 30 請輸入年收入/萬元 : 40

將原始數據集內容與想預測的數據畫在同一張圖上，一起查看關聯

# ====================================================
# Phase3 : Correlation diagram
# ====================================================
# Scatter plot of the known records (age vs. income, coloured by the product
# bought) with the record to predict drawn on top as a large black point.
ggplot(
    data = knn_df,
    mapping = aes(
        x = age_year,
        y = income_million,
        color = buy_product
    )
) +
    geom_point() +
    # The prediction layer gets its own data and mapping. Without `data =` the
    # layer would inherit knn_df and draw the same marker once per training
    # row, and it would break if predict_df ever held more than one row.
    geom_point(
        data = predict_df,
        mapping = aes(x = age, y = income),
        inherit.aes = FALSE,
        color = "black",
        size = 5
    )

導入所有公式計算

# ====================================================
# Phase4 : Calculate the values required for KNN learning
# ====================================================
# Coerce the two feature columns to numeric BEFORE computing any statistics,
# so mean() and sd() below always operate on numbers.
knn_df$age_year <- as.numeric(knn_df$age_year)
knn_df$income_million <- as.numeric(knn_df$income_million)

# Mean and standard deviation of each feature, taken from the known data.
avg_age <- mean(knn_df$age_year) # age mean
sd_age <- sd(knn_df$age_year) # age standard deviation
avg_income <- mean(knn_df$income_million) # income mean
sd_income <- sd(knn_df$income_million) # income standard deviation

# Standardize the known data: (value - mean) / standard deviation per feature.
# Bare column names are used so mutate() resolves them inside the data frame.
knn_df <- knn_df %>%
    mutate(
        age_stand = (age_year - avg_age) / sd_age,
        income_stand = (income_million - avg_income) / sd_income
    )

# Standardize the record to predict with the SAME mean and sd as the known
# data, so both live on the same scale.
predict_df <- predict_df %>%
    mutate(
        age_stand = (age - avg_age) / sd_age,
        income_stand = (income - avg_income) / sd_income
    )

# The distance below compares every known row against one prediction row;
# more rows would be silently recycled and give meaningless distances.
stopifnot(nrow(predict_df) == 1L)

# `similarity` is the Euclidean distance between each known row and the
# prediction row in standardized space: the smaller, the more similar.
knn_df <- knn_df %>%
    mutate(
        similarity = sqrt(
            (predict_df$age_stand - age_stand)^2 +
                (predict_df$income_stand - income_stand)^2
        )
    )

看一下目前結果

# Inspect the record to predict (auto-prints when run interactively)

predict_df

# 結果輸出

r$> predict_df # A tibble: 1 x 5 name age income age_stand income_stand <chr> <dbl> <dbl> <dbl> <dbl> 1 Rex 30 40 -0.889 -0.545

# Inspect the known data set (auto-prints when run interactively)

knn_df

r$> knn_df X name age_year income_million gender live_area buy_product date 1 1 Abe 25 102 male taipei apple 2021/12/20 2 2 Abraham 33 66 male taichung apple 2022/1/15 3 3 Ivan 53 53 male Tainan android 2022/1/23 4 4 Freda 65 70 female Hsinchu apple 2022/2/25 5 5 Geraldine 28 28 female Tainan android 2022/2/1 6 6 Grace 37 36 female Changhua android 2021/10/30 7 7 Joanne 44 41 female taipei android 2021/11/5 8 8 Maria 33 55 female Tainan apple 2021/11/13 9 9 Ozzie 49 68 male Hsinchu apple 2022/3/30 10 10 Pete 30 35 male taichung android 2022/3/1 11 11 Mary 47 44 male Hsinchu android 2022/3/25 12 12 Liv 35 55 male taipei apple 2022/3/12 13 13 Mark 45 77 female Tainan apple 2022/2/1 14 14 Jason 55 76 female Tainan apple 2022/1/1 15 15 Rose 35 80 female taichung apple 2022/1/3 16 16 Fra 55 30 male taichung android 2022/2/5 17 17 Nane 45 55 female Hsinchu apple 2021/12/15 18 18 Nami 34 26 male taichung android 2021/11/5 19 19 Chris 22 12 male taipei apple 2022/10/1 20 20 Bob 33 36 male Tainan android 2021/11/23 age_stand income_stand similarity 1 -1.3275207 2.21214628 2.7914426 2 -0.6265197 0.61139721 1.1856065 3 1.1259829 0.03334894 2.0966374 4 2.1774845 0.78925822 3.3444272 5 -1.0646454 -1.07828236 0.5616258 6 -0.2760192 -0.72256034 0.6386427 7 0.3373568 -0.50023408 1.2275574 8 -0.6265197 0.12227944 0.7169129 9 0.7754824 0.70032772 2.0789203 10 -0.8893951 -0.76702559 0.2223263 11 0.6002322 -0.36683833 1.5002079 12 -0.4512694 0.12227944 0.7980068 13 0.4249819 1.10051498 2.1057818 14 1.3012332 1.05604973 2.7131623 15 -0.4512694 1.23391074 1.8317772 16 1.3012332 -0.98935185 2.2353005 17 0.4249819 0.12227944 1.4739225 18 -0.5388946 -1.16721286 0.7144044 19 -1.5903961 -1.78972638 1.4288089 20 -0.6265197 -0.72256034 0.3173925

最後透過相似度將符合的數據抓出來並Print

# ====================================================
# Phase5 : Find the most similar record and print its product
# ====================================================

# Eyeball the derived columns and the prediction row before searching.
knn_df[["age_stand"]]
knn_df[["income_stand"]]
knn_df[["similarity"]]
predict_df

# Smallest value in the similarity column, i.e. the value belonging to the
# known record that lies closest to the prediction row.
correlation_df <- min(knn_df[["similarity"]])

# 輸出結果

r$> correlation_df [1] 0.2223263

# Row index of the closest known record.
# which.min() always yields a single index (the first one when several rows
# tie) and skips NA values, unlike which(x == min(x)).
accord_loc <- which.min(knn_df$similarity)

# 輸出結果

r$> accord_loc [1] 10

# Keep the matching row of the known data set.
# Guard against an empty match, then take the first index explicitly instead
# of building the redundant range `accord_loc:accord_loc`.
stopifnot(length(accord_loc) >= 1L)
predict_product <- knn_df[accord_loc[1L], ]

# 輸出結果

r$> predict_product X name age_year income_million gender live_area buy_product date 10 10 Pete 30 35 male taichung android 2022/3/1 age_stand income_stand similarity 10 -0.8893951 -0.7670256 0.2223263

# Print the product bought by the nearest record as the prediction.
# as.character() keeps the label readable if the column is a factor, and the
# trailing "\n" ends the line so the console prompt starts on a fresh line.
cat(
    "依照年齡與年收入預測商品為 : ",
    as.character(predict_product$buy_product),
    "\n"
)

# 輸出結果

r$> # 印出預測商品 cat("依照年齡與年收入預測商品為 : ", predict_product$buy_product) 依照年齡與年收入預測商品為 : android

當然這樣做每次預測都要重跑一次，數據集大了跑的速度也會很慢，這只是以基本公式怎麼去寫出預測的演算方式，可以做個GUI是最好啦~若有任何錯誤之處，也請各位大大不吝指教!

圖片出處引用 : 由 Antti Ajanki AnAj - 自己的作品, CC BY-SA 3.0, https://commons.wikimedia.org/w/index.php?curid=2170282

小雷的 Programming & Analytic 日誌

搜尋感興趣的網誌

所有文章連結

2022年4月1日星期五

R Machine Learning Predict Product - KNN | R 機器學習預測商品 - KNN

沒有留言:

張貼留言

其他文章

看看精選文章

納希克房價分析 | Nashik Apartment Price Analyze – 語法解析(上)

標籤

檢舉濫用情形