Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

There are two operations to realize a Recommender System with Odysseus:

Example of how to use the operators

In this example, the MovieLens dataset is used. The column separator was changed to tab.

The file u_ordered.data is ordered by timestamp (this is not necessary, but it allows implementations to take advantage of temporal effects, e.g., concept drift). The file unique_temporal_ordered_users.data has only the user column of u_ordered.data; duplicates are removed. The file rfr.csv has a sample of the users in u_ordered.data. Both files are attached.

Code Block
linenumberstrue
/// Input files of the PQL example. The two variables are referenced by the
/// ACCESS operators that define the sources rating_data and rfr.
/// NOTE(review): these paths use ml-100k/ while the CREATE STREAM statements
/// read from datasets/ml-100k/ - confirm which directory layout is intended.
#DEFINE rating_data_input_file ${PROJECTPATH}/ml-100k/u_ordered.data
#DEFINE rfr_data_input_file ${PROJECTPATH}/ml-100k/rfr.csv


/// The CREATE STREAM statements that follow are CQL, so the CQL parser is
/// selected here. The PQL parser is selected again before the first PQL query.
#PARSER CQL


/// Rating stream: tuples (userid, itemid, rating, timestamp) pulled from the
/// tab-delimited file u_ordered.data.
/// NOTE(review): 'scheduler.delay' is presumably the pause between two reads
/// in milliseconds - confirm against the GenericPull documentation.
#RUNQUERY
CREATE STREAM ml100k (
      userid    Integer,
      itemid    Integer,
      rating    Double,
      timestamp Long
   )
   WRAPPER 'GenericPull'
   PROTOCOL 'CSV'
   TRANSPORT 'File'
   DATAHANDLER 'Tuple'
   OPTIONS (
      'filename' '${PROJECTPATH}/datasets/ml-100k/u_ordered.data',
      'delimiter' '\t',
      'scheduler.delay' '100'
   )

/// User stream: single-attribute tuples (userid) pulled from the file
/// unique_temporal_ordered_users.data. It is the input for which the
/// RECOMMENDATION query produces recommendations.
/// NOTE(review): 'scheduler.delay' is presumably the pause between two reads
/// in milliseconds - confirm against the GenericPull documentation.
#RUNQUERY
CREATE STREAM ml100k_users (
      userid Integer
   )
   WRAPPER 'GenericPull'
   PROTOCOL 'CSV'
   TRANSPORT 'File'
   DATAHANDLER 'Tuple'
   OPTIONS (
      'filename' '${PROJECTPATH}/datasets/ml-100k/unique_temporal_ordered_users.data',
      'delimiter' '\t',
      'scheduler.delay' '1000'
   )

#PARSER PQL

/// Learn recommendation models from the ml100k rating stream.
/// The item, user and rating parameters name the attributes of ml100k that
/// carry the respective values; the learner and its options are passed
/// through as given.
#ADDQUERY
recommendationModels = RECOMMENDATION_LEARN({
      item = 'itemid',
      user = 'userid',
      rating = 'rating',
      learner = 'Mahout',
      options = [
         'OptionRecommender'='SVDRecommender',
         'OptionFactorizer'='SVDPlusPlusFactorizer'
      ]
   },
   ml100k
)

/// Produce recommendations for the users arriving on ml100k_users, using the
/// models delivered by recommendationModels.
/// NOTE(review): no_of_recommendations presumably limits the result to five
/// items per user - confirm against the operator documentation.
#ADDQUERY
recommendations = RECOMMENDATION(
   {
      recommender = 'recommender',
      user = 'userid',
      no_of_recommendations = 5
   },
   ml100k_users,
   recommendationModels)

/// Source of the rating data for the RecommenderSystem query: tab-delimited
/// tuples (user, item, rating, timestamp) read from the file named by
/// rating_data_input_file. The timestamp attribute is declared as
/// StartTimeStamp. Defined in its own query, like the rfr source.
#RUNQUERY
rating_data := ACCESS({
   source = 'rating_data',
   wrapper = 'GenericPull',
   transport = 'File',
   protocol = 'CSV',
   datahandler = 'Tuple',
   options = [['Delimiter', '\t'], ['filename', '${rating_data_input_file}']],
   schema = [['user','Integer'], ['item','Integer'], ['rating','Double'], ['timestamp','StartTimeStamp']]
})
 
/// Source rfr: tab-delimited tuples (user, timestamp) read from the file
/// named by rfr_data_input_file, wrapped in a time window of size 1.
/// NOTE(review): the unit of the window size is not stated here - confirm the
/// default time unit of TIMEWINDOW.
#RUNQUERY
rfr := TIMEWINDOW(
   {size = 1},
   ACCESS({
      source = 'rfr',
      wrapper = 'GenericPull',
      transport = 'File',
      protocol = 'CSV',
      datahandler = 'Tuple',
      options = [['Delimiter', '\t'], ['filename', '${rfr_data_input_file}']],
      schema = [['user','Integer'], ['timestamp','StartTimeStamp']]
   })
)


/// Main query: learns rating models from rating_data, derives recommendations
/// for the users arriving on rfr, and evaluates the models on test data.
#QNAME RecommenderSystem
#QUERY
/// Split the rating stream into learning and test data.
/// Output port 0 is consumed by the learning branch, port 1 by the evaluation branch.
/// NOTE(review): the meaning of strategy 'ITTT' is not visible here - confirm
/// against the EXTRACT_TEST_DATA documentation.
splitted_rating_data = EXTRACT_TEST_DATA({strategy = 'ITTT'}, rating_data)

/// Continuous learning: models are trained on a 30-day time window over the
/// learning data (port 0 of the split).
windowed_learning_data = TIMEWINDOW({size = [30, 'days']}, 0:splitted_rating_data)
models = TRAIN_RECSYS_MODEL({learner = 'BRISMF.MOA'}, windowed_learning_data)

/// Recommending: join the requests from rfr with the models, predict a rating
/// for every candidate and keep at most top_n candidates with a predicted
/// rating of at least min_rating.
/// NOTE(review): the candidate step reads port 1 of models while the prediction
/// steps read the default port - presumably intentional; confirm against the
/// TRAIN_RECSYS_MODEL documentation.
recomm_candidates = RECOMMENDATION_CANDIDATES(JOIN(rfr, 1:models))
predicted_candidates = PREDICT_RATING(JOIN(models, recomm_candidates))
recommendations = RECOMMEND({top_n = 8, min_rating = 3.5}, predicted_candidates)

/// Evaluation: predict ratings for the test data (port 1 of the split) and
/// aggregate the prediction errors over 24-hour windows.
predicted_test_data = PREDICT_RATING(JOIN(models, 1:splitted_rating_data))
model_errors = TEST_PREDICTION({aggregation_window_size = [24, 'hours']},
               predicted_test_data)