-
Notifications
You must be signed in to change notification settings - Fork 3
/
inputDatLibFM_converter.py
73 lines (57 loc) · 2.85 KB
/
inputDatLibFM_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import numpy as np
import os
# The whole goal of this file is to now convert our data into the format that is used for
# ratings.dat in the movielens dataset which is:
# UserID::MovieID::Rating::Timestamp
# once we do this, we can use a Perl script built into libFM to convert these ratings
# and then make predictions
# http://files.grouplens.org/datasets/movielens/ml-10m-README.html
# https://github.com/srendle/libfm
# http://www.libfm.org/libfm-1.42.manual.pdf
# prepend to the file
# https://www.quora.com/How-can-I-write-text-in-the-first-line-of-an-existing-file-using-Python
#def prependEntries(fileName, string1, string2):
# with open(fileName, 'r+') as f:
# file_data = f.read()
# f.seek(0,0)
# f.write(string1.rstrip('\r\n') + '\n' + string2.rstrip('\r\n') + '\n' + file_data)
# wrote this function knowing what was inside of our files with our columns
def fileToMatrixMarket_MU(fileName, label):
    """Convert one ratings CSV into a MovieLens-style ``ratings.dat`` file.

    Reads ``data/<fileName>``, drops the ``'Unnamed: 0'`` and ``'bin'``
    columns, casts everything that remains to ``int32`` and swaps the third
    and fourth columns (user, movie, date, rating -> user, movie, rating,
    date). The result is written without header or index to
    ``libfm/<label>_libFM`` with ``::`` between fields, i.e. one
    ``UserID::MovieID::Rating::Timestamp`` record per line.

    Args:
        fileName: Name of the CSV file inside the ``data`` directory.
        label: Prefix of the output file name; also used in progress messages.

    Raises:
        KeyError: If the CSV lacks the ``'Unnamed: 0'`` or ``'bin'`` column.
        IndexError: If fewer than four columns remain after dropping those.
    """
    # MU data
    print('Loading data', label,'mu...')
    df = pd.read_csv(os.path.join('data', fileName))

    # get rid of the data we're not using: the saved index column and the
    # 'bin' column
    del df['Unnamed: 0']
    del df['bin']
    df = df.astype('int32')

    # move the ratings to the third column and the date number to the last.
    # Each name is wrapped in a one-element list on purpose: list(name) would
    # split the column name into single characters.
    cols = df.columns.tolist()  # user, movie, date, rating
    cols = cols[:2] + [cols[3]] + [cols[2]]  # user, movie, rating, date
    df = df[cols]

    newFileName = label + '_libFM'
    newFileLocation = "libfm/" + newFileName
    print('Making new file', newFileName)

    # DataFrame.to_csv only accepts a one-character separator, so the '::'
    # delimited file is written with numpy instead; '%d' matches the int32
    # cast above.
    os.makedirs('libfm', exist_ok=True)
    np.savetxt(newFileLocation, df.values, fmt='%d', delimiter='::')
    print('Finished reading in data')

    print('file processing done for', label, 'new file created', newFileLocation, '\n')
# Convert every MU split in turn; each split's CSV file is named after its
# label, so the file name is derived from the label.
for _label in (
    'mu_train',
    'mu_val',
    'mu_probe',
    'mu_qual',
    'mu_qual_val',
    'mu_qual_probe',
):
    fileToMatrixMarket_MU(_label + '.csv', _label)