Source code for hidi.inout

import numpy as np
import pandas as pd

from hidi.transform import Transform

class ReadTransform(Transform):
    """
    Load one or more csv files from disk into a single DataFrame.

    Each input file should be a csv document with the columns
    :code:`link_id`, :code:`item_id` and, optionally, :code:`score`.
    When a file has no :code:`score` column, every row read from that
    file is given a score of one. In the context of traditional
    collaborative filtering, :code:`link_id` represents the "user" and
    :code:`item_id` represents the "item".

    :param infiles: Array of paths to the csv documents to be loaded
        and concatenated into one DataFrame. Each csv document must
        have a :code:`link_id` and an :code:`item_id` column. An
        optional :code:`score` column may also be supplied.
    :type infiles: array
    """

    def __init__(self, infiles, **kwargs):
        # Extra keyword arguments are accepted but not used here.
        self._inputs = infiles

    def _normalize(self, df):
        """
        Reduce a freshly read frame to the three expected columns.

        A missing :code:`score` column is added to ``df`` in place,
        filled with ones, before the columns are selected.

        :param df: A DataFrame holding the contents of one input file.
        :type df: pandas.DataFrame

        :rtype: pandas.DataFrame
        """
        wanted = ['link_id', 'item_id', 'score']
        score_present = 'score' in df.columns
        if not score_present:
            df['score'] = np.ones(len(df))
        return df[wanted]

    def transform(self, **kwargs):
        """
        Read in files from the :code:`infiles` array given upon
        instantiation.

        All files are read first, then each frame is normalized, and
        finally the frames are concatenated in the order given.

        :returns: A tuple of the concatenated DataFrame and the keyword
            arguments that were passed in, unchanged.
        :rtype: tuple(pandas.DataFrame, dict)
        """
        raw_frames = [pd.read_csv(path) for path in self._inputs]
        normalized = list(map(self._normalize, raw_frames))
        combined = pd.concat(normalized)
        return combined, kwargs
class WriteTransform(Transform):
    """
    Write output to disk in csv or json formats.

    :param outfile: A string that is a path to the desired output on
        the file system.
    :type outfile: str
    :param file_format: A string that is a file extension, either
        :code:`json` or :code:`csv`. Any value other than :code:`csv`
        results in json output.
    :type file_format: str
    :param enc: Text encoding used for the output file, for both
        formats. :code:`None` leaves the choice to the underlying
        writer's default.
    :type enc: str
    :param link_key: Key under which each row's index label is stored
        in json output.
    :type link_key: str
    """

    def __init__(self, outfile, file_format='csv', enc=None,
                 link_key='link_id'):
        self.outfile = outfile
        self.file_format = file_format
        self.link_key = link_key
        self.encoding = enc

    def transform(self, df, **kwargs):
        """
        Write a DataFrame to a file.

        For :code:`csv` the frame is written with
        :code:`DataFrame.to_csv`. Otherwise one json object is written
        per line, holding the row's index label under
        :code:`link_key` and the row's values as a list under
        :code:`factors`.

        :param df: The Pandas DataFrame to be written to a file
        :type df: pandas.DataFrame

        :returns: A tuple of the unmodified DataFrame and the keyword
            arguments that were passed in.
        :rtype: tuple(pandas.DataFrame, dict)
        """
        if self.file_format == 'csv':
            df.to_csv(self.outfile, encoding=self.encoding)
            return df, kwargs

        import json

        # Honour the configured encoding here as well; previously it
        # was only applied to csv output. The file is never read back,
        # so plain write mode is sufficient.
        with open(self.outfile, 'w', encoding=self.encoding) as f:
            for link, factors in df.iterrows():
                record = {
                    self.link_key: link,
                    'factors': factors.tolist(),
                }
                f.write(json.dumps(record))
                f.write('\n')
        return df, kwargs