Source code for cornac.datasets.goodreads


"""
This data is built from the GoodReads dataset.
"""
from ..utils import cache
from ..data import Reader
from typing import List

import numpy as np
import pandas as pd
from ..data import FeatureModality, SentimentModality
from ..eval_methods import RatioSplit


def load_feedback(fpath, fmt="UIR", sep=',', skip_lines=0, reader: Reader = None) -> List:
    """Read user-item ratings (scale: [1,5]) from a rating file.

    Parameters
    ----------
    fpath: str
        Path to the rating file (e.g. xx-rating.txt).

    fmt: str, default: 'UIR'
        Line format, forwarded unchanged to ``reader.read``.

    sep: str, default: ','
        Field separator, forwarded unchanged to ``reader.read``.

    skip_lines: int, default: 0
        Forwarded unchanged to ``reader.read`` as its ``skip_lines`` argument.

    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A default ``Reader()`` is
        created when none is given.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples (user, item, rating).
    """
    if reader is None:
        reader = Reader()
    return reader.read(fpath, fmt=fmt, sep=sep, skip_lines=skip_lines)


def load_sentiment(reader: Reader = None) -> List:
    """Fetch and read the user-item-sentiment data.

    The dataset was constructed by the method described in the reference paper.

    Parameters
    ----------
    reader: `obj:cornac.data.Reader`, default: None
        Reader object used to read the data. A default ``Reader()`` is
        created when none is given.

    Returns
    -------
    data: array-like
        Data in the form of a list of tuples
        (user, item, [(aspect, opinion, sentiment), (aspect, opinion, sentiment), ...]).

    References
    ----------
    [1] Gao, J., Wang, X., Wang, Y., & Xie, X. (2019). Explainable Recommendation
        Through Attentive Multi-View Learning. AAAI.
    """
    # Resolve the file through the cache helper before touching the reader.
    sentiment_path = cache(url='https://zenodo.org/records/11061007/files/goodreads_sentiment.txt?download=1')
    if reader is None:
        reader = Reader()
    return reader.read(sentiment_path, fmt='UITup', sep=',', tup_sep=':')


def prepare_data(data_name="goodreads", test_size=0.2, dense=False, verbose=False, seed=42, item=True, user=False, sample_size=0.1):
    """Prepare data for the GoodReads dataset.

    Sub-sample the requested dataset variant and generate its train/test split.

    Parameters
    ----------
    data_name: str, default: 'goodreads'
        Name of the dataset to be prepared. Options:
        'goodreads', 'goodreads_uir', 'goodreads_uir_1000', 'goodreads_limers'
        - 'goodreads': user-item-rating with sentiment data.
        - 'goodreads_uir': user-item-rating data in the whole dataset.
        - 'goodreads_uir_1000': user-item-rating data with 1000 samples.
        - 'goodreads_limers': user-item-rating data with item genres and user aspects.

    test_size: float, default: 0.2
        The proportion of the dataset to include in the test split.

    dense: bool, default: False
        If True, use the dense version of the rating file. Has no effect
        for 'goodreads_uir'.

    verbose: bool, default: False
        If True, print out messages.

    seed: int, default: 42
        Random seed, used both for the sub-sampling step and for the split.

    item: bool, default: True
        If True, include item genres when preparing 'goodreads_limers' and
        keep only ratings whose item has a genre entry.

    user: bool, default: False
        If True, include user aspects when preparing 'goodreads_limers' and
        keep only ratings whose user has an aspect entry.

    sample_size: float, default: 0.1
        The proportion of the ratings to sample before splitting.

    Returns
    -------
    rs: `obj:cornac.eval_methods.RatioSplit`
        The data split, or None (after printing a message) when `data_name`
        is not one of the supported options.
    """
    base_url = 'https://zenodo.org/records/11061007/files/'
    dense_file = 'good_read_dense.csv'
    uir_columns = ['user_id', 'item_id', 'rating']

    def fetch(filename):
        # Files are resolved lazily so each branch only downloads what it
        # uses; an unknown data_name triggers no download at all.
        return cache(url=f'{base_url}{filename}?download=1')

    def read_uir(fpath):
        # All UIR csv files are tab-separated with a header row that is
        # replaced by the canonical column names.
        return pd.read_csv(fpath, sep='\t', header=0, names=uir_columns)

    if verbose:
        print('Preparing data...')

    if data_name == 'goodreads':
        if dense:
            fpath_rating, sep_rating, skip_lines = fetch(dense_file), '\t', 1
        else:
            fpath_rating, sep_rating, skip_lines = fetch('goodreads_rating.txt'), ',', 0
        sentiment_modality = SentimentModality(data=load_sentiment())
        rating = load_feedback(fpath_rating, sep=sep_rating, skip_lines=skip_lines)
        # Use a generator seeded with `seed` so the sub-sample is reproducible
        # instead of depending on the global NumPy random state.
        rng = np.random.RandomState(seed)
        indices = rng.choice(len(rating), int(len(rating) * sample_size), replace=False)
        rating = np.array(rating)[indices]
        rs = RatioSplit(data=rating, test_size=test_size, exclude_unknowns=True,
                        sentiment=sentiment_modality, verbose=verbose, seed=seed)

    elif data_name in ('goodreads_uir', 'goodreads_uir_1000'):
        if data_name == 'goodreads_uir':
            # The dense flag is ignored for this variant.
            fpath_uir = fetch('good_read_UIR_sample.csv')
        elif dense:
            fpath_uir = fetch(dense_file)
        else:
            fpath_uir = fetch('good_read_UIR_1000.csv')
        df = read_uir(fpath_uir).sample(frac=sample_size, random_state=seed)
        rs = RatioSplit(data=df[uir_columns].values, test_size=test_size,
                        verbose=verbose, seed=seed)

    elif data_name == 'goodreads_limers':
        df = read_uir(fetch(dense_file if dense else 'good_read_UIR_sample.csv'))

        # Only the requested modalities are loaded and passed on; with neither
        # flag set the split is built from the ratings alone.
        modalities = {}
        if item:
            genres = pd.read_csv(fetch('goodreads_genres.csv'))
            item_features = np.array(
                [[x, y] for x, y in zip(genres['item_id'].to_numpy(), genres['feature'].to_numpy())]
            )
            df = df[df['item_id'].isin(genres['item_id'])]
            modalities['item_feature'] = FeatureModality(item_features)
        if user:
            user_aspects = pd.read_csv(fetch('uid_aspect_features.txt'), sep='\t',
                                       usecols=['user_id', 'feature'])
            user_features = np.array(
                [[x, y] for x, y in zip(user_aspects['user_id'].to_numpy(), user_aspects['feature'].to_numpy())]
            )
            df = df[df['user_id'].isin(user_aspects['user_id'])]
            modalities['user_feature'] = FeatureModality(user_features)

        # Sample after filtering so sample_size applies to the usable ratings.
        df = df.sample(frac=sample_size, random_state=seed)
        rs = RatioSplit(data=df[uir_columns].values, seed=seed, test_size=test_size,
                        exclude_unknowns=True, verbose=verbose, **modalities)

    else:
        print(f'No dataset named {data_name}')
        return None

    if verbose:
        print('Data prepared.')
    return rs