Quickstart

import pandas as pd

from parquetranger import TableRepo
# Example data: six records with a string index (a1..a6).
# Column C2 is used as the grouping column of the table repo below.
df = pd.DataFrame(
    {
        "A": [1, 2, 3, 4, 5, 6],
        "B": ["x", "y", "z", "x1", "x2", "x3"],
        "C": [1, 2, 1, 1, 1, 2],
        "C2": ["a", "a", "b", "a", "c", "c"],
    },
    index=["a1", "a2", "a3", "a4", "a5", "a6"],
)
df
A B C C2
a1 1 x 1 a
a2 2 y 2 a
a3 3 z 1 b
a4 4 x1 1 a
a5 5 x2 1 c
a6 6 x3 2 c
# Records are grouped by the values of column C2.
trepo = TableRepo("some_tmp_path", group_cols="C2")  # this creates the directory
trepo.extend(df)  # add the records of df to the repo
# Read everything back as a single frame; note that in the output the rows
# are ordered by group (C2), not in the original order of df.
trepo.get_full_df()
A B C C2
a1 1 x 1 a
a2 2 y 2 a
a4 4 x1 1 a
a3 3 z 1 b
a5 5 x2 1 c
a6 6 x3 2 c
# Second example frame: indices a1 and a4 already exist in df
# (a4 with a different C2 value there), while a7 is new.
df2 = pd.DataFrame(
    {
        "A": [21, 22, 23],
        "B": ["X", "Y", "Z"],
        "C": [10, 20, 1],
        "C2": ["a", "b", "a"],
    },
    index=["a1", "a4", "a7"],
)
# Replaces records based on index: a1 and a4 are overwritten with the rows of
# df2 (a4 ends up in group b, as its C2 value changed), and a7 is added.
trepo.replace_records(df2)
trepo.get_full_df()
A B C C2
a2 2 y 2 a
a1 21 X 10 a
a7 23 Z 1 a
a3 3 z 1 b
a4 22 Y 20 b
a5 5 x2 1 c
a6 6 x3 2 c
# Replaces whole groups: every group present in df2 (C2 == "a" and C2 == "b")
# now contains only the records of df2; group "c" is left untouched.
trepo.replace_groups(df2)
trepo.get_full_df()
A B C C2
a1 21 X 10 a
a7 23 Z 1 a
a4 22 Y 20 b
a5 5 x2 1 c
a6 6 x3 2 c
# Erases everything and stores df2 instead; all traces of df are lost.
trepo.replace_all(df2)
trepo.get_full_df()
A B C C2
a1 21 X 10 a
a7 23 Z 1 a
a4 22 Y 20 b
# Replaces records based on index, but only looks for matching indices within
# each group. This way a duplicate index (a4) is possible, because the two a4
# records are in different groups (they have different values in C2).
trepo.replace_records(df, by_groups=True)
trepo.get_full_df()
A B C C2
a7 23 Z 1 a
a1 1 x 1 a
a2 2 y 2 a
a4 4 x1 1 a
a4 22 Y 20 b
a3 3 z 1 b
a5 5 x2 1 c
a6 6 x3 2 c
trepo.purge()  # clean up: deletes everything