Source code for hidi.clean
from hidi.transform import Transform
class DedupeTransform(Transform):
    """
    Remove repeated link-item pairs from a tall skinny DataFrame.

    :param skip_dedupe: When truthy, :meth:`transform` hands its input
        back untouched instead of deduplicating it. Defaults to
        :code:`False`.
    """

    def __init__(self, skip_dedupe=False):
        self.skip_dedupe = skip_dedupe

    def transform(self, df, **kwargs):
        """
        Collapse rows of :code:`df` that share the same
        :code:`link_id` / :code:`item_id` combination, so that every
        pair appears no more than once in the result.

        Deduplication is delegated to
        :code:`DataFrame.drop_duplicates` restricted to those two
        columns. When :code:`skip_dedupe` was set on this instance,
        :code:`df` is passed through unchanged.

        :param df: The dataframe to dedupe; it is expected to carry
            :code:`link_id` and :code:`item_id` columns
        :type df: pandas.DataFrame
        :param kwargs: Extra keyword arguments; they are not inspected
            here and are returned as-is alongside the dataframe
        :returns: A 2-tuple of the (possibly deduplicated) dataframe
            and the :code:`kwargs` dict
        :rtype: tuple
        """
        if not self.skip_dedupe:
            # Only the pair columns decide what counts as a duplicate;
            # other columns are ignored for the comparison.
            df = df.drop_duplicates(subset=['link_id', 'item_id'])
        return df, kwargs