patternpythonMinor
Similarity research : K-Nearest Neighbour(KNN) using a linear regression to determine the weights
Viewed 0 times
Tags: python, knn, nearest-neighbour, linear-regression, similarity-research, weights
Problem
I have a set of houses with categorical and numerical data. Later I will have a new house and my goal will be to find the 20 closest houses.
The code is working fine, and the results are not so bad, but it is way too slow. With a sample of 10,000 houses it takes 6 minutes, using Python 2.7. My actual dataset is about 100,000 houses.
```
# -- coding: utf-8 --
import csv
import random
import math
import operator
import numpy as np
import pandas as pd, os
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from unidecode import unidecode
import sys
# The function I use to deal with categorical data
def one_hot_dataframe(data, cols, replace=False):
    """One-hot encode the categorical columns *cols* of *data*.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame containing every column named in *cols*.
    cols : list of str
        Names of the categorical columns to encode.
    replace : bool, optional
        When True, drop the original columns from *data* and join the
        encoded columns in their place.

    Returns
    -------
    tuple
        ``(data, vecData, vec)`` — the (possibly augmented) frame, a frame
        holding only the encoded columns, and the fitted DictVectorizer.
    """
    vec = DictVectorizer()

    # Build one {column: value} mapping per row; DictVectorizer one-hot
    # encodes string values and passes numeric values through unchanged.
    def row_to_dict(row):
        return {col: row[col] for col in cols}

    vecData = pd.DataFrame(
        vec.fit_transform(data[cols].apply(row_to_dict, axis=1)).toarray()
    )
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return (data, vecData, vec)
# New data set with numerical data only
data_encode, _, _= one_hot_dataframe(data, ['Type', 'Bord_de_mer', 'Bord_de_plan_deau', 'Campagne', 'Centre_ville', 'Complexe_de_vacances', 'Lac', 'Montagne', 'Plage', 'Riviere', 'Village', 'Ville', 'Acces_haut_debit', 'Acces_internet', 'Climatisation', 'Linge_de_maison_fourni', 'Serviettes_de_bain', 'Wifi', 'Terrasse', 'Veranda_Loggia', 'Balcon', 'Jardin', 'location_non_fumeur', 'fumeurs_acceptes', 'animaux_autorises', 'animaux_non_admis', 'acces_handicape', 'Piscine_commune', 'Piscine_privee', 'Piscine_dinterieur', 'Piscine_chauffee', 'Bain_a_remous', 'Sauna'], replace=True)
#convert string to float
data = data.convert_objects(convert_numeric=True)
# deleting a useless column
data = data_encode.drop('SizeIn',1)
# I create a column for the index
data.
The code is working fine, and the results are not so bad, but it is way too slow. With a sample of 10,000 houses it takes 6 minutes, using Python 2.7. My actual dataset is about 100,000 houses.
```
# -- coding: utf-8 --
import csv
import random
import math
import operator
import numpy as np
import pandas as pd, os
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from unidecode import unidecode
import sys
# The function I use to deal with categorical data
def one_hot_dataframe(data, cols, replace=False):
    """Expand the categorical columns *cols* into one-hot indicator columns.

    Returns a tuple ``(data, vecData, vec)``: the (possibly augmented)
    frame, a frame of just the encoded columns, and the fitted
    DictVectorizer. When *replace* is True the original columns are
    dropped and the encoded ones joined in their place.
    """
    vectorizer = DictVectorizer()
    # DictVectorizer consumes one {column: value} mapping per row.
    row_dicts = data[cols].to_dict(orient="records")
    encoded = pd.DataFrame(
        vectorizer.fit_transform(row_dicts).toarray(),
        columns=vectorizer.get_feature_names(),
        index=data.index,
    )
    if replace is True:
        data = data.drop(cols, axis=1).join(encoded)
    return (data, encoded, vectorizer)
# New data set with numerical data only
data_encode, _, _= one_hot_dataframe(data, ['Type', 'Bord_de_mer', 'Bord_de_plan_deau', 'Campagne', 'Centre_ville', 'Complexe_de_vacances', 'Lac', 'Montagne', 'Plage', 'Riviere', 'Village', 'Ville', 'Acces_haut_debit', 'Acces_internet', 'Climatisation', 'Linge_de_maison_fourni', 'Serviettes_de_bain', 'Wifi', 'Terrasse', 'Veranda_Loggia', 'Balcon', 'Jardin', 'location_non_fumeur', 'fumeurs_acceptes', 'animaux_autorises', 'animaux_non_admis', 'acces_handicape', 'Piscine_commune', 'Piscine_privee', 'Piscine_dinterieur', 'Piscine_chauffee', 'Bain_a_remous', 'Sauna'], replace=True)
#convert string to float
data = data.convert_objects(convert_numeric=True)
# deleting a useless column
data = data_encode.drop('SizeIn',1)
# I create a column for the index
data.
Solution
To my eye, what sticks out is:
because if you inline euclideanDistance, it becomes:
so your algorithm is O(n²) over the dataset. See if you can convert this into some
for x in range(len(trainingSet)):
dist = euclideanDistance(testInstance, trainingSet.iloc[x,:], 3, len(testInstance))
    ncol.append(dist)

because if you inline `euclideanDistance`, it becomes:
for x in range(len(trainingSet)):
dist = 0
instance1, instance2 = testInstance, trainingSet.iloc[x,:]
for xx in range(3, len(testInstance)):
dist += (1/abs(weights.iloc[xx-3,1]))*pow((instance1[xx] - instance2[xx]), 2)
    ncol.append(math.sqrt(dist))

so your algorithm is O(n²) over the dataset. See if you can convert this into some
DataFrame operations — they're (I think) backed by numpy, so they are vectorized for better efficiency.

Code Snippets
for x in range(len(trainingSet)):
dist = euclideanDistance(testInstance, trainingSet.iloc[x,:], 3, len(testInstance))
    ncol.append(dist)

for x in range(len(trainingSet)):
dist = 0
instance1, instance2 = testInstance, trainingSet.iloc[x,:]
for xx in range(3, len(testInstance)):
dist += (1/abs(weights.iloc[xx-3,1]))*pow((instance1[xx] - instance2[xx]), 2)
    ncol.append(math.sqrt(dist))

Context
StackExchange Code Review Q#136096, answer score: 2
Revisions (0)
No revisions yet.