In this example, the MovieLens dataset is used. The column separator was changed to a tab character.
The file u_ordered.data is ordered by timestamp (this is not necessary, but it allows implementations to take advantage of temporal effects, e.g., concept drift). The file rfr.csv contains a sample of the users in u_ordered.data. Both files are attached.
Code Block | ||
---|---|---|
| ||
/// Locations of the two input files, both below the ml-100k folder of the project path.
#DEFINE rating_data_input_file ${PROJECTPATH}/ml-100k/u_ordered.data
#DEFINE rfr_data_input_file ${PROJECTPATH}/ml-100k/rfr.csv
/// The queries that follow are written in PQL.
#PARSER PQL
#RUNQUERY
// A data stream of ratings.
// The source is read from the rating data input file defined above (File transport,
// GenericPull wrapper) and parsed as CSV with a tab delimiter into tuples with the
// attributes user, item, rating and timestamp.
// NOTE(review): the 'timestamp' attribute is typed as StartTimeStamp; presumably this makes
// it the start timestamp of each stream element - confirm against the Odysseus documentation.
rating_data := ACCESS({source='rating_data', wrapper='GenericPull', transport='File', protocol='CSV', datahandler='Tuple',
options=[
['Delimiter', '\t'],
['filename', '${rating_data_input_file}']],
schema=[
['user','Integer'], // some learners need Long instead of Integer
['item','Integer'], // some learners need Long instead of Integer
['rating','Double'],
['timestamp','StartTimeStamp']
]
})
#RUNQUERY
// A data stream of users' requests for recommendations (RfR).
// The source is read from the RfR input file defined above and parsed as CSV with a tab
// delimiter; each tuple carries the requesting user and a timestamp.
// The source is wrapped in a TIMEWINDOW with size = 1.
// NOTE(review): no time unit is given for the window size here - confirm which default unit applies.
rfr := TIMEWINDOW({size = 1}, ACCESS({source='rfr', wrapper='GenericPull', transport='File', protocol='CSV', datahandler='Tuple',
options=[
['Delimiter', '\t'],
['filename', '${rfr_data_input_file}']],
schema=[
['user','Integer'], // some learners need Long instead of Integer
['timestamp','StartTimeStamp']
]
}))
#QNAME RecommenderSystem
#QUERY
/// Split the rating stream into learning data and test data.
/// Further below, output 0 of this operator is used as learning data and output 1 as test data.
/// NOTE(review): 'ITTT' presumably stands for "interleaved test-then-train" - confirm.
splitted_rating_data = EXTRACT_TEST_DATA({strategy = 'ITTT'}, rating_data)
/// Continuous learning: put a 30-day time window over the learning data (output 0 of the split)
/// and train recommender models on it with the 'BRISMF.MOA' learner.
windowed_learning_data = TIMEWINDOW({size = [30, 'days']}, 0:splitted_rating_data)
models = TRAIN_RECSYS_MODEL({learner = 'BRISMF.MOA'}, windowed_learning_data)
/// Recommending: join the requests for recommendations with output 1 of the model operator
/// to obtain recommendation candidates, join the candidates with the models to predict their
/// ratings, and finally derive the recommendations.
/// NOTE(review): what output 1 of TRAIN_RECSYS_MODEL carries is not visible here - confirm.
/// NOTE(review): top_n = 8 and min_rating = 3.5 presumably mean "at most 8 items with a
/// predicted rating of at least 3.5" - confirm.
recomm_candidates = RECOMMENDATION_CANDIDATES(JOIN(rfr, 1:models))
predicted_candidates = PREDICT_RATING(JOIN(models, recomm_candidates))
recommendations = RECOMMEND({top_n = 8, min_rating = 3.5}, predicted_candidates)
/// Evaluation: predict ratings for the test data (output 1 of the split) with the models and
/// pass the predictions to TEST_PREDICTION, configured with a 24-hour aggregation window.
predicted_test_data = PREDICT_RATING(JOIN(models, 1:splitted_rating_data))
model_errors = TEST_PREDICTION({aggregation_window_size = [24, 'hours']}, predicted_test_data)
|