Simple Approach to Recommender Systems
#importing necessary libraries
import pandas as pd
import numpy as np
These datasets are hosted on: https://archive.ics.uci.edu/ml/datasets/Restaurant+%26+consumer+data
They were originally published by: Blanca Vargas-Govea, Juan Gabriel González-Serna, Rafael Ponce-Medellín. Effects of relevant contextual features in the performance of a restaurant recommender system. In RecSys’11: Workshop on Context Aware Recommender Systems (CARS-2011), Chicago, IL, USA, October 23, 2011.
# Read the two source tables: user ratings per place, and cuisine labels per place.
frame, cuisine = (
    pd.read_csv(csv_path)
    for csv_path in ('rating_final.csv', 'chefmozcuisine.csv')
)
|
userID |
placeID |
rating |
food_rating |
service_rating |
0 |
U1077 |
135085 |
2 |
2 |
2 |
1 |
U1077 |
135038 |
2 |
2 |
1 |
2 |
U1077 |
132825 |
2 |
2 |
2 |
3 |
U1077 |
135060 |
1 |
2 |
2 |
4 |
U1068 |
135104 |
1 |
1 |
2 |
|
placeID |
Rcuisine |
0 |
135110 |
Spanish |
1 |
135109 |
Italian |
2 |
135107 |
Latin_American |
3 |
135106 |
Mexican |
4 |
135105 |
Fast_Food |
Recommending based on counts
# Popularity baseline: count how many ratings each place has received,
# then show the most frequently rated places first.
rating_count = frame.groupby('placeID')['rating'].count().to_frame()
rating_count.sort_values(by='rating', ascending=False).head()
|
rating |
placeID |
|
135085 |
36 |
132825 |
32 |
135032 |
28 |
135052 |
25 |
132834 |
25 |
# Take the five most frequently rated places straight from rating_count rather
# than copying their IDs by hand, so the list cannot drift from the data.
# reset_index() turns the placeID index back into a regular column for the merge.
most_rated_places = (
    rating_count.sort_values('rating', ascending=False)
    .head()
    .reset_index()[['placeID']]
)
# Attach the cuisines served by each of those places. A place listed under
# several cuisines yields several rows; a place with no cuisine entry is
# dropped by the (default) inner join.
summary = pd.merge(most_rated_places, cuisine, on='placeID')
summary
|
placeID |
Rcuisine |
0 |
135085 |
Fast_Food |
1 |
132825 |
Mexican |
2 |
135032 |
Cafeteria |
3 |
135032 |
Contemporary |
4 |
135052 |
Bar |
5 |
135052 |
Bar_Pub_Brewery |
6 |
132834 |
Mexican |
# Summarize the cuisine labels: describe() reports the most frequently listed
# cuisine ("top") and how many times it occurs ("freq") in the cuisine dataset.
cuisine.loc[:, 'Rcuisine'].describe()
count 916
unique 59
top Mexican
freq 239
Name: Rcuisine, dtype: object
Simple Approaches to Recommender Systems
Making Recommendations Based on Correlation
# Load the geographical/place metadata.
# The 'mbcs' codec exists only on Windows, so the original call fails with a
# LookupError elsewhere. 'latin-1' is available on every platform and maps
# every byte to a character, so the read cannot fail on decoding.
# NOTE(review): assumes the file is in a Western single-byte encoding, as the
# accented place names suggest -- confirm against the raw file.
geodata = pd.read_csv('geoplaces2.csv', encoding='latin-1')
|
userID |
placeID |
rating |
food_rating |
service_rating |
0 |
U1077 |
135085 |
2 |
2 |
2 |
1 |
U1077 |
135038 |
2 |
2 |
1 |
2 |
U1077 |
132825 |
2 |
2 |
2 |
3 |
U1077 |
135060 |
1 |
2 |
2 |
4 |
U1068 |
135104 |
1 |
1 |
2 |
|
placeID |
latitude |
longitude |
the_geom_meter |
name |
address |
city |
state |
country |
fax |
... |
alcohol |
smoking_area |
dress_code |
accessibility |
price |
url |
Rambience |
franchise |
area |
other_services |
0 |
134999 |
18.915421 |
-99.184871 |
0101000020957F000088568DE356715AC138C0A525FC46... |
Kiku Cuernavaca |
Revolucion |
Cuernavaca |
Morelos |
Mexico |
? |
... |
No_Alcohol_Served |
none |
informal |
no_accessibility |
medium |
kikucuernavaca.com.mx |
familiar |
f |
closed |
none |
1 |
132825 |
22.147392 |
-100.983092 |
0101000020957F00001AD016568C4858C1243261274BA5... |
puesto de tacos |
esquina santos degollado y leon guzman |
s.l.p. |
s.l.p. |
mexico |
? |
... |
No_Alcohol_Served |
none |
informal |
completely |
low |
? |
familiar |
f |
open |
none |
2 |
135106 |
22.149709 |
-100.976093 |
0101000020957F0000649D6F21634858C119AE9BF528A3... |
El Rincón de San Francisco |
Universidad 169 |
San Luis Potosi |
San Luis Potosi |
Mexico |
? |
... |
Wine-Beer |
only at bar |
informal |
partially |
medium |
? |
familiar |
f |
open |
none |
3 |
132667 |
23.752697 |
-99.163359 |
0101000020957F00005D67BCDDED8157C1222A2DC8D84D... |
little pizza Emilio Portes Gil |
calle emilio portes gil |
victoria |
tamaulipas |
? |
? |
... |
No_Alcohol_Served |
none |
informal |
completely |
low |
? |
familiar |
t |
closed |
none |
4 |
132613 |
23.752903 |
-99.165076 |
0101000020957F00008EBA2D06DC8157C194E03B7B504E... |
carnitas_mata |
lic. Emilio portes gil |
victoria |
Tamaulipas |
Mexico |
? |
... |
No_Alcohol_Served |
permitted |
informal |
completely |
medium |
? |
familiar |
t |
closed |
none |
5 rows × 21 columns
# Keep only the identifier and display name of each place.
places = geodata.loc[:, ['placeID', 'name']]
places.head(5)
|
placeID |
name |
0 |
134999 |
Kiku Cuernavaca |
1 |
132825 |
puesto de tacos |
2 |
135106 |
El Rincón de San Francisco |
3 |
132667 |
little pizza Emilio Portes Gil |
4 |
132613 |
carnitas_mata |
|
placeID |
Rcuisine |
0 |
135110 |
Spanish |
1 |
135109 |
Italian |
2 |
135107 |
Latin_American |
3 |
135106 |
Mexican |
4 |
135105 |
Fast_Food |
Grouping and Ranking Data
# Mean overall rating per place, as a one-column DataFrame indexed by placeID.
rating = frame.groupby('placeID')['rating'].mean().to_frame()
rating.head(5)
|
rating |
placeID |
|
132560 |
0.50 |
132561 |
0.75 |
132564 |
1.25 |
132572 |
1.00 |
132583 |
1.00 |
# Add the number of ratings behind each mean; the Series aligns on placeID.
rating['rating_count'] = frame.groupby('placeID')['rating'].count()
rating.head(5)
|
rating |
rating_count |
placeID |
|
|
132560 |
0.50 |
4 |
132561 |
0.75 |
4 |
132564 |
1.25 |
4 |
132572 |
1.00 |
15 |
132583 |
1.00 |
4 |
|
rating |
rating_count |
count |
130.000000 |
130.000000 |
mean |
1.179622 |
8.930769 |
std |
0.349354 |
6.124279 |
min |
0.250000 |
3.000000 |
25% |
1.000000 |
5.000000 |
50% |
1.181818 |
7.000000 |
75% |
1.400000 |
11.000000 |
max |
2.000000 |
36.000000 |
# Most frequently rated places first, shown with their mean rating.
rating.sort_values(by=['rating_count'], ascending=[False]).iloc[:5]
|
rating |
rating_count |
placeID |
|
|
135085 |
1.333333 |
36 |
132825 |
1.281250 |
32 |
135032 |
1.178571 |
28 |
135052 |
1.280000 |
25 |
132834 |
1.000000 |
25 |
# Use the most frequently rated place (placeID 135085) as the reference
# for finding similar places; look up its name first.
places.loc[places['placeID'].eq(135085)]
|
placeID |
name |
121 |
135085 |
Tortas Locas Hipocampo |
# Cuisine label(s) recorded for the reference place.
cuisine.loc[cuisine['placeID'].eq(135085)]
|
placeID |
Rcuisine |
44 |
135085 |
Fast_Food |
# Build a user x place matrix of overall ratings (rows: userID, columns: placeID).
# Cells stay NaN where a user has not rated a place.
places_crosstab = frame.pivot_table(
    index='userID',
    columns='placeID',
    values='rating',
)
places_crosstab.head(5)
placeID |
132560 |
132561 |
132564 |
132572 |
132583 |
132584 |
132594 |
132608 |
132609 |
132613 |
... |
135080 |
135081 |
135082 |
135085 |
135086 |
135088 |
135104 |
135106 |
135108 |
135109 |
userID |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
U1001 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
0.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
U1002 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
1.0 |
NaN |
NaN |
NaN |
1.0 |
NaN |
NaN |
U1003 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
2.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
U1004 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2.0 |
NaN |
NaN |
U1005 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
5 rows × 130 columns
# Ratings given to Tortas Locas Hipocampo (placeID 135085), one entry per user;
# NaN marks users who never rated it. Keep the raw rating values here: wrapping
# the column in .notnull() would turn it into a True/False mask, so the
# correlation computed from it would measure "did the user rate this place"
# rather than how similarly the places are rated.
# NOTE: outputs recorded from the boolean-mask version must be regenerated.
Tortas_ratings = places_crosstab[135085]
# NaN >= 0 evaluates to False, so this displays only the users who rated the place.
Tortas_ratings[Tortas_ratings>=0]
userID
U1001 True
U1002 True
U1003 False
U1004 False
U1005 False
...
U1134 True
U1135 True
U1136 False
U1137 True
U1138 False
Name: 135085, Length: 138, dtype: bool
# Correlate every place's column of the user x place matrix with the
# Tortas series (column-wise pairwise correlation via corrwith).
similar_to_Tortas = places_crosstab.corrwith(Tortas_ratings)
# Store the coefficients in a DataFrame and discard places for which
# no correlation could be computed (NaN).
corr_Tortas = similar_to_Tortas.to_frame(name='PearsonR').dropna()
corr_Tortas.head(5)
|
PearsonR |
placeID |
|
132572 |
-0.211289 |
132723 |
0.092057 |
132754 |
0.159152 |
132755 |
0.250000 |
132825 |
0.222473 |
# Pair each correlation with the number of ratings the place received.
Tortas_corr_summary = corr_Tortas.join(rating['rating_count'])
# Rank the places most similar to Tortas Locas Hipocampo by PearsonR,
# considering only places that have at least 10 ratings.
(
    Tortas_corr_summary
    .loc[Tortas_corr_summary['rating_count'].ge(10)]
    .sort_values(by='PearsonR', ascending=False)
    .head(10)
)
|
PearsonR |
rating_count |
placeID |
|
|
132951 |
0.845154 |
10 |
135030 |
0.478091 |
12 |
135075 |
0.470317 |
13 |
135064 |
0.445285 |
17 |
135079 |
0.362166 |
17 |
135081 |
0.361449 |
11 |
135062 |
0.307941 |
21 |
135069 |
0.301511 |
12 |
135057 |
0.294174 |
15 |
132825 |
0.222473 |
32 |
# Build the list of places to summarize from the computed ranking instead of
# hand-typed IDs, which can disagree with the correlation table.
# The list starts with the reference place itself (135085), followed by the
# six best-correlated other places that have at least 10 ratings -- seven
# entries in total, as before.
places_corr_Tortas = pd.DataFrame({
    'placeID': [
        135085,
        *Tortas_corr_summary[Tortas_corr_summary['rating_count'] >= 10]
        # The reference place may or may not be present in the summary;
        # drop it if it is so it is not listed twice.
        .drop(index=135085, errors='ignore')
        .sort_values('PearsonR', ascending=False)
        .head(6)
        .index,
    ]
})
# Attach cuisine labels; places without a cuisine entry are dropped by the
# (default) inner join.
summary = pd.merge(places_corr_Tortas, cuisine, on='placeID')
summary
|
placeID |
Rcuisine |
0 |
135085 |
Fast_Food |
1 |
132754 |
Mexican |
2 |
135028 |
Mexican |
3 |
135042 |
Chinese |
4 |
135046 |
Fast_Food |
# Look up the name of placeID 135046.
places.loc[places['placeID'].eq(135046)]
|
placeID |
name |
42 |
135046 |
Restaurante El Reyecito |