Source code for hidi.inout
import numpy as np
import pandas as pd
from hidi.transform import Transform
class ReadTransform(Transform):
    """
    Read input csv data from disk.

    Input data should be a csv file formatted with three
    columns: :code:`link_id`, :code:`item_id`, and
    :code:`score`. If score is not provided, it will be
    defaulted to one. :code:`link_id` represents the
    "user" and :code:`item_id` represents the "item" in the
    context of traditional collaborative filtering.

    :param infiles:
        Array of paths to csv documents to be loaded
        and concatenated into one DataFrame. Each csv
        document must have a :code:`link_id` and a
        :code:`item_id` column. An optional
        :code:`score` column may also be supplied. A single
        path string is also accepted and is treated as a
        one-element array.
    :type infiles: array
    """

    def __init__(self, infiles, **kwargs):
        # Extra keyword arguments are accepted but not used here.
        # A bare string would be iterated character by character in
        # transform(), so wrap it into a one-element list.
        if isinstance(infiles, str):
            infiles = [infiles]
        self._inputs = infiles

    def _normalize(self, df):
        """
        Reduce a DataFrame to the :code:`link_id`,
        :code:`item_id` and :code:`score` columns, in that
        order, filling :code:`score` with ones when the column
        is absent.

        :param df: A DataFrame read from one input file.
        :type df: pandas.DataFrame
        :rtype: pandas.DataFrame
        """
        if 'score' not in df.columns:
            # assign() returns a new frame, so the DataFrame passed
            # in by the caller is not modified.
            df = df.assign(score=np.ones(df.shape[0]))
        return df[['link_id', 'item_id', 'score']]

    def transform(self, **kwargs):
        """
        Read in files from the :code:`infiles` array given
        upon instantiation and concatenate them.

        The keyword arguments are passed through unchanged as
        the second element of the returned tuple.

        :rtype: tuple of (pandas.DataFrame, dict)
        """
        dfs = [pd.read_csv(inp) for inp in self._inputs]
        dfs = [self._normalize(df) for df in dfs]
        # NOTE: pd.concat raises ValueError when given no frames,
        # i.e. when ``infiles`` was empty.
        return pd.concat(dfs), kwargs
class WriteTransform(Transform):
    """
    Write output to disk in csv or json formats.

    :param outfile: A string that is a path to the desired
        output on the file system.
    :type outfile: str
    :param file_format: A string that is a file extension,
        either :code:`json` or :code:`csv`. Any value other
        than :code:`csv` is written as json.
    :type file_format: str
    :param enc: Text encoding passed to the writer. When
        :code:`None`, each writer falls back to its own
        default.
    :type enc: str
    :param link_key: Key under which each row's index label
        is stored in the json output.
    :type link_key: str
    """

    def __init__(self, outfile, file_format='csv',
                 enc=None, link_key='link_id'):
        self.outfile = outfile
        self.file_format = file_format
        self.link_key = link_key
        self.encoding = enc

    def transform(self, df, **kwargs):
        """
        Write a DataFrame to a file.

        In csv mode the frame is written with
        :code:`DataFrame.to_csv`. Otherwise one json object is
        written per line, holding the row's index label under
        :code:`link_key` and the row's values as a list under
        :code:`factors`.

        The DataFrame and the keyword arguments are returned
        unchanged.

        :param df: The Pandas DataFrame to be written to a
            file
        :type df: pandas.DataFrame
        :rtype: tuple of (pandas.DataFrame, dict)
        """
        if self.file_format == 'csv':
            df.to_csv(self.outfile, encoding=self.encoding)
        else:
            import json

            # Honour the configured encoding here as well; the csv
            # branch already does.
            with open(self.outfile, 'w', encoding=self.encoding) as f:
                for link, factors in df.iterrows():
                    record = {
                        self.link_key: link,
                        'factors': factors.tolist()
                    }
                    f.write(json.dumps(record))
                    f.write('\n')
        return df, kwargs