開始進入機器學習的領域,這真的是很有趣的部分,這次要透過基本的程式語言進行KNN的演算編輯,之後再來加入模組進行演算,看看會不會差太多😆,其實用到的函數不多,出來的結果也是很耐人尋味,數據集可以不斷增加,相似度的結果也會出現更多有趣的事情,本篇以年齡、收入做比對模擬可以推薦的手機品牌(Apple, Android),沒有任何的歧視意味,開始建立數據集時只是以直覺填入。
KNN需要的一些基本數學公式
- 點A到點B距離公式 : |AB| = √((x1 - x2)² + (y1 - y2)²)
- 標準差 : R有基本函數可以用`sd()`
- 平均值 : 函數`mean()`
- 單位標準化 : (原始數值 - 平均值) / 標準差
- 相似度 : 以標準化後兩點之間的距離表示,距離越小代表越相似;開根號使用函數`sqrt()`
數據集可以自己建立,小雷是建立一個.csv檔案,裡面可以透過其他欄位進行相似比對
完整的程式碼也會放在Github,有需要可以自行取用
小雷的Github >> 請點我
載入R包
library(tidyverse)
library(dplyr)
library(ggplot2)
載入數據位置與數據集
# ====================================================
# Phase1 : Import data
# ====================================================
# Directory that holds the learning data set. The full path is built with
# file.path() so the session's working directory is not changed by setwd().
data_dir <- "D:\\Github_version_file_R\\data_set\\marchine_learning_data\\KNN"
# Import the known (already labelled) learning data
knn_df <- read.csv(file.path(data_dir, "machine_learn_R.csv"))
# Print the data set to check that it was loaded
knn_df
# 數據集內容
r$> knn_df
X name age_year income_million gender live_area buy_product date
1 1 Abe 25 102 male taipei apple 2021/12/20
2 2 Abraham 33 66 male taichung apple 2022/1/15
3 3 Ivan 53 53 male Tainan android 2022/1/23
4 4 Freda 65 70 female Hsinchu apple 2022/2/25
5 5 Geraldine 28 28 female Tainan android 2022/2/1
6 6 Grace 37 36 female Changhua android 2021/10/30
7 7 Joanne 44 41 female taipei android 2021/11/5
8 8 Maria 33 55 female Tainan apple 2021/11/13
9 9 Ozzie 49 68 male Hsinchu apple 2022/3/30
10 10 Pete 30 35 male taichung android 2022/3/1
11 11 Mary 47 44 male Hsinchu android 2022/3/25
12 12 Liv 35 55 male taipei apple 2022/3/12
13 13 Mark 45 77 female Tainan apple 2022/2/1
14 14 Jason 55 76 female Tainan apple 2022/1/1
15 15 Rose 35 80 female taichung apple 2022/1/3
16 16 Fra 55 30 male taichung android 2022/2/5
17 17 Nane 45 55 female Hsinchu apple 2021/12/15
18 18 Nami 34 26 male taichung android 2021/11/5
19 19 Chris 22 12 male taipei apple 2022/10/1
20 20 Bob 33 36 male Tainan android 2021/11/23
自定義想要模擬的預測數據
# ====================================================
# Phase2 : Create forecast data
# ====================================================
# Read one number from the console. Input that cannot be parsed as a number
# stops here with a clear message instead of becoming NA and silently
# flowing into every later calculation.
read_number <- function(prompt) {
  value <- suppressWarnings(as.numeric(readline(prompt = prompt)))
  if (is.na(value)) {
    stop("Input must be a number.", call. = FALSE)
  }
  value
}
# Age and yearly income of the person to predict for
age <- read_number("請輸入年齡 : ")
income <- read_number("請輸入年收入/萬元 : ")
# One-row table holding the record to predict
predict_df <- tibble::tribble(
  ~name, ~age, ~income,
  "Rex", age, income # values typed in above
)
# 自訂義輸入
r$> age <- as.numeric(readline(prompt = "請輸入年齡 : "))
income <- as.numeric(readline(prompt = "請輸入年收入/萬元 : "))
請輸入年齡 : 30
請輸入年收入/萬元 : 40
將原始數據集內容與想預測的數據圖擺在一起查看關聯
# ====================================================
# Phase3 : Correlation diagram
# ====================================================
# Scatter plot of the learning data (age vs. income, colored by the product
# bought) with the record to predict drawn on top as a large black point.
ggplot(
  data = knn_df,
  mapping = aes(
    x = age_year,
    y = income_million,
    color = buy_product
  )
) +
  geom_point() +
  # annotate() draws the marker exactly once; a geom_point() layer with
  # constant x/y inherits knn_df and repeats the same point for every row.
  annotate(
    "point",
    x = predict_df$age,
    y = predict_df$income,
    color = "black",
    size = 5
  )
導入所有公式計算
# ====================================================
# Phase4 : Calculate the values required for KNN learning
# ====================================================
# Convert both features to numeric BEFORE any statistic is computed, so that
# mean() and sd() always run on the converted columns.
knn_df$age_year <- as.numeric(knn_df$age_year)
knn_df$income_million <- as.numeric(knn_df$income_million)
# Mean and standard deviation of each feature in the learning data
avg_age <- mean(knn_df$age_year) # age avg
sd_age <- sd(knn_df$age_year) # age sd
avg_income <- mean(knn_df$income_million) # income avg
sd_income <- sd(knn_df$income_million) # income sd
# Standardize the learning data: (value - feature mean) / feature sd.
# Columns are referenced by bare name inside mutate().
knn_df <- knn_df %>%
  mutate(
    age_stand = (age_year - avg_age) / sd_age,
    income_stand = (income_million - avg_income) / sd_income
  )
# Standardize the record to predict with the mean/sd of the learning data,
# so both tables are on the same scale.
predict_df <- predict_df %>%
  mutate(
    age_stand = (age - avg_age) / sd_age,
    income_stand = (income - avg_income) / sd_income
  )
# Distance between every learning row and the record to predict, computed on
# the standardized features. The column is named "similarity", but a SMALLER
# value means the row is closer to the record to predict.
knn_df <- knn_df %>%
  mutate(
    similarity = sqrt(
      (predict_df$age_stand - age_stand)^2 +
        (predict_df$income_stand - income_stand)^2
    )
  )
看一下目前結果
# Show the record to predict, including its standardized columns
predict_df
# 結果輸出
r$> predict_df
# A tibble: 1 x 5
name age income age_stand income_stand
<chr> <dbl> <dbl> <dbl> <dbl>
1 Rex 30 40 -0.889 -0.545
# Show the learning data, which now also carries the age_stand,
# income_stand and similarity columns added in Phase 4
knn_df
r$> knn_df
X name age_year income_million gender live_area buy_product date
1 1 Abe 25 102 male taipei apple 2021/12/20
2 2 Abraham 33 66 male taichung apple 2022/1/15
3 3 Ivan 53 53 male Tainan android 2022/1/23
4 4 Freda 65 70 female Hsinchu apple 2022/2/25
5 5 Geraldine 28 28 female Tainan android 2022/2/1
6 6 Grace 37 36 female Changhua android 2021/10/30
7 7 Joanne 44 41 female taipei android 2021/11/5
8 8 Maria 33 55 female Tainan apple 2021/11/13
9 9 Ozzie 49 68 male Hsinchu apple 2022/3/30
10 10 Pete 30 35 male taichung android 2022/3/1
11 11 Mary 47 44 male Hsinchu android 2022/3/25
12 12 Liv 35 55 male taipei apple 2022/3/12
13 13 Mark 45 77 female Tainan apple 2022/2/1
14 14 Jason 55 76 female Tainan apple 2022/1/1
15 15 Rose 35 80 female taichung apple 2022/1/3
16 16 Fra 55 30 male taichung android 2022/2/5
17 17 Nane 45 55 female Hsinchu apple 2021/12/15
18 18 Nami 34 26 male taichung android 2021/11/5
19 19 Chris 22 12 male taipei apple 2022/10/1
20 20 Bob 33 36 male Tainan android 2021/11/23
age_stand income_stand similarity
1 -1.3275207 2.21214628 2.7914426
2 -0.6265197 0.61139721 1.1856065
3 1.1259829 0.03334894 2.0966374
4 2.1774845 0.78925822 3.3444272
5 -1.0646454 -1.07828236 0.5616258
6 -0.2760192 -0.72256034 0.6386427
7 0.3373568 -0.50023408 1.2275574
8 -0.6265197 0.12227944 0.7169129
9 0.7754824 0.70032772 2.0789203
10 -0.8893951 -0.76702559 0.2223263
11 0.6002322 -0.36683833 1.5002079
12 -0.4512694 0.12227944 0.7980068
13 0.4249819 1.10051498 2.1057818
14 1.3012332 1.05604973 2.7131623
15 -0.4512694 1.23391074 1.8317772
16 1.3012332 -0.98935185 2.2353005
17 0.4249819 0.12227944 1.4739225
18 -0.5388946 -1.16721286 0.7144044
19 -1.5903961 -1.78972638 1.4288089
20 -0.6265197 -0.72256034 0.3173925
最後透過相似度將符合的數據抓出來並Print
# ====================================================
# Phase5 : Find the closest learning record and print its product
# ====================================================
# Inspect the intermediate columns and the record to predict
knn_df[["age_stand"]]
knn_df[["income_stand"]]
knn_df[["similarity"]]
predict_df
# Smallest value of the similarity column, i.e. the closest learning record
correlation_df <- min(knn_df[["similarity"]])
# 輸出結果
r$> correlation_df
[1] 0.2223263
# Row index of the closest learning record. which.min() always returns a
# single index (the first one on ties) and skips NA values, whereas
# which(x == min(x)) can return several indices or none at all.
accord_loc <- which.min(knn_df$similarity)
# 輸出結果
r$> accord_loc
[1] 10
# Keep the matching row. A self-range such as a:a adds nothing for a single
# index and only uses the first element (with a warning) when several
# indices tie, so the first index is selected explicitly instead.
stopifnot(length(accord_loc) >= 1)
predict_product <- knn_df[accord_loc[1], ]
# 輸出結果
r$> predict_product
X name age_year income_million gender live_area buy_product date
10 10 Pete 30 35 male taichung android 2022/3/1
age_stand income_stand similarity
10 -0.8893951 -0.7670256 0.2223263
# Print the predicted product. as.character() keeps the label readable even
# when buy_product was read as a factor, which cat() would otherwise print
# as its integer code.
cat("依照年齡與年收入預測商品為 : ", as.character(predict_product$buy_product))
# 輸出結果
r$> # 印出預測商品
cat("依照年齡與年收入預測商品為 : ", predict_product$buy_product)
依照年齡與年收入預測商品為 : android
當然這樣做每次預測都要重跑一次,數據集大了跑的速度也會很慢,這只是以基本公式怎麼去寫出預測的演算方式,可以做個GUI是最好啦~若有任何錯誤之處,也請各位大大不吝指教!
圖片出處引用 : 由 Antti Ajanki AnAj - 自己的作品, CC BY-SA 3.0, https://commons.wikimedia.org/w/index.php?curid=2170282