#include "EXTERN.h" #include "perl.h" #include "XSUB.h" #include "ppport.h" #include "const-c.inc" #include //=================================================================== // // Constants and Type Declarations // //=================================================================== // #define MAX_FEATURES 64 // Number of features to use #define MAX_FEATURES 8 // #define MIN_EPOCHS 120 // Minimum number of epochs per feature #define MIN_EPOCHS 500 // #define MAX_EPOCHS 200 // Max epochs per feature #define MAX_EPOCHS 2000 #define MIN_IMPROVEMENT 0.005 // Minimum improvement required to continue current feature #define INITx 0.1 // Initialization value for features #define LRATE 0.001 // Learning rate parameter #define K 0.015 // Regularization parameter used to minimize over-fitting ////////////////////////////////////////////////////////////////////////////////////////////////////// typedef unsigned char BYTE; #define true 1 #define false 0 struct Movie { int RatingCount; int RatingSum; double RatingAvg; double PseudoAvg; }; // PseudoAvg is a weighted average used to deal with small movie counts struct Customer { int RatingCount; int RatingSum; }; // int CustomerId; struct Data { int CustId; short MovieId; BYTE Rating; float Cache; }; // class Engine int num_customers; int num_movies; int num_ratings; // private: int RatingCount; // Current number of loaded ratings struct Data *Ratings; // Array of ratings data struct Movie *Movies; // Array of movie metrics struct Customer *Customers; // Array of customer metrics float **MovieFeatures; // Array of features by movie (using floats to save space) float **CustFeatures; // Array of features by customer (using floats to save space) // was private, should be public now void Engine(int, int, int); void DestroyEngine(); void set_Movies(int, int, int); void set_Ratings(int, int, int); void CalcMetrics(); void CalcFeatures(); inline double PredictRating(short, int); inline double PredictRating2(short, int, int, float, bool); // end class Engine // sdw accessors for Perl void set_Movies(int movieId, int count, int sum) { Movies[movieId].RatingCount = count; Movies[movieId].RatingSum = sum; } void set_Ratings(int movieId, int custId, int rating) { Ratings[RatingCount].MovieId = (short)movieId; Ratings[RatingCount].CustId = custId; Ratings[RatingCount].Rating = (BYTE)rating; Ratings[RatingCount].Cache = 0; RatingCount++; } //=================================================================== // // Engine Class // //=================================================================== //------------------------------------------------------------------- // Initialization //------------------------------------------------------------------- void Engine(int x_num_customers, int x_num_ratings, int x_num_movies) { int i, f; num_customers = x_num_customers; // yeah, I know... there's a trick for doing this. don't know it. XXX todo. num_ratings = x_num_ratings; num_movies = x_num_movies; RatingCount = 0; Ratings = (struct Data*)calloc(sizeof(struct Data), num_ratings); Movies = (struct Movie*)calloc(sizeof(struct Movie), num_movies); Customers = (struct Customer*)calloc(sizeof(struct Customer), num_customers); MovieFeatures = (float **)calloc(sizeof(float *), MAX_FEATURES); // calloc should have these 0 initialized CustFeatures = (float **)calloc(sizeof(float *), MAX_FEATURES); for (f=0; fMovieId].RatingCount++; Movies[rating->MovieId].RatingSum += rating->Rating; // killed map translate stuff here -- sdw Customers[rating->CustId].RatingCount++; Customers[rating->CustId].RatingSum += rating->Rating; } // Do a follow-up loop to calc movie averages for (i=0; iRatingAvg = movie->RatingSum / (1.0 * movie->RatingCount); movie->PseudoAvg = (3.23 * 25 + movie->RatingSum) / (25.0 + movie->RatingCount); } } // // CalcFeatures // - Iteratively train each feature on the entire data set // - Once sufficient progress has been made, move on // void CalcFeatures() { int f, e, i, custId, cnt = 0; struct Data* rating; double err, p, sq, rmse_last, rmse = 2.0; short movieId; float cf, mf; for (f=0; fMovieId; custId = rating->CustId; // Predict rating and calc error p = PredictRating2(movieId, custId, f, rating->Cache, true); // wprintf(L" movieId=%d custId=%d rating=%d predicted=%1.2f finalpredicted=%1.2f\n", movieId, custId, rating->Rating, p, PredictRating(movieId, custId)); err = (1.0 * rating->Rating - p); sq += err*err; // Cache off old feature values cf = CustFeatures[f][custId]; mf = MovieFeatures[f][movieId]; // Cross-train the features CustFeatures[f][custId] += (float)(LRATE * (err * mf - K * cf)); MovieFeatures[f][movieId] += (float)(LRATE * (err * cf - K * mf)); } rmse = sqrt(sq/RatingCount); // wprintf(L" cnt='%d' rmse='%f' />\n",cnt,rmse); } // Cache off old predictions for (i=0; iCache = (float)PredictRating2(rating->MovieId, rating->CustId, f, rating->Cache, false); } } } // // PredictRating // - During training there is no need to loop through all of the features // - Use a cache for the leading features and do a quick calculation for the trailing // - The trailing can be optionally removed when calculating a new cache value // double PredictRating2(short movieId, int custId, int feature, float cache, bool bTrailing) { // Get cached value for old features or default to an average double sum = (cache > 0) ? cache : 1; //Movies[movieId].PseudoAvg; // Add contribution of current feature sum += MovieFeatures[feature][movieId] * CustFeatures[feature][custId]; if (sum > 5) sum = 5; if (sum < 1) sum = 1; // Add up trailing defaults values if (bTrailing) // if (bTrailing && feature > 0) // sdw -- no way to test this without more complicated data so leaving it alone for now { sum += (MAX_FEATURES-feature-1) * (INITx * INITx); // sdw -- this is bizarre; wrap around from the other side minus one? // sum += (feature-1) * (INITx * INITx); // sdw -- wouldn't this be more correct? meh? if (sum > 5) sum = 5; if (sum < 1) sum = 1; } return sum; } // // PredictRating // - This version is used for calculating the final results // - It loops through the entire list of finished features // double PredictRating(short movieId, int custId) { double sum = 1; //Movies[movieId].PseudoAvg; int f; for (f=0; f 5) sum = 5; if (sum < 1) sum = 1; return sum; } /* ############################################################################### # # SVD Sample Code # # Copyright (C) 2007 Timely Development (www.timelydevelopment.com) # # Special thanks to Simon Funk and others from the Netflix Prize contest # for providing pseudo-code and tuning hints. # # Feel free to use this code as you wish as long as you include # these notices and attribution. # # Also, if you have alternative types of algorithms for accomplishing # the same goal and would like to contribute, please share them as well :) # # STANDARD DISCLAIMER: # # - THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY # - OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT # - LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND#OR # - FITNESS FOR A PARTICULAR PURPOSE. # ############################################################################### Copyright © 2008 by Timely Development, LLC. */ MODULE = Math::Preference::SVD PACKAGE = Math::Preference::SVD INCLUDE: const-xs.inc PROTOTYPES: DISABLE void set_Movies (movieId, count, sum) int movieId int count int sum PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; set_Movies(movieId, count, sum); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ void set_Ratings (movieId, custId, rating) int movieId int custId int rating PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; set_Ratings(movieId, custId, rating); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ void Engine (x_num_customers, x_num_ratings, x_num_movies) int x_num_customers int x_num_ratings int x_num_movies PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; Engine(x_num_customers, x_num_ratings, x_num_movies); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ void DestroyEngine () PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; DestroyEngine(); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ void CalcMetrics () PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; CalcMetrics(); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ void CalcFeatures () PREINIT: I32* temp; PPCODE: temp = PL_markstack_ptr++; CalcFeatures(); if (PL_markstack_ptr != temp) { /* truly void, because dXSARGS not invoked */ PL_markstack_ptr = temp; XSRETURN_EMPTY; /* return empty stack */ } /* must have used dXSARGS; list context implied */ return; /* assume stack size is correct */ double PredictRating2 (movieId, custId, feature, cache, bTrailing) short movieId int custId int feature float cache bool bTrailing double PredictRating (movieId, custId) short movieId int custId