Quickstart
import pandas as pd
from parquetranger import TableRepo
df = pd.DataFrame(
{
"A": [1, 2, 3, 4, 5, 6],
"B": ["x", "y", "z", "x1", "x2", "x3"],
"C": [1, 2, 1, 1, 1, 2],
"C2": ["a", "a", "b", "a", "c", "c"],
},
index=["a1", "a2", "a3", "a4", "a5", "a6"],
)
df
| | A | B | C | C2 |
|---|---|---|---|---|
| a1 | 1 | x | 1 | a |
| a2 | 2 | y | 2 | a |
| a3 | 3 | z | 1 | b |
| a4 | 4 | x1 | 1 | a |
| a5 | 5 | x2 | 1 | c |
| a6 | 6 | x3 | 2 | c |
trepo = TableRepo("some_tmp_path", group_cols="C2") # this creates the directory
trepo.extend(df)
trepo.get_full_df()
| | A | B | C | C2 |
|---|---|---|---|---|
| a1 | 1 | x | 1 | a |
| a2 | 2 | y | 2 | a |
| a4 | 4 | x1 | 1 | a |
| a3 | 3 | z | 1 | b |
| a5 | 5 | x2 | 1 | c |
| a6 | 6 | x3 | 2 | c |
df2 = pd.DataFrame(
{
"A": [21, 22, 23],
"B": ["X", "Y", "Z"],
"C": [10,20,1],
"C2": ["a", "b", "a"],
},
index=["a1", "a4", "a7"]
)
trepo.replace_records(df2) # replaces based on index
trepo.get_full_df()
| | A | B | C | C2 |
|---|---|---|---|---|
| a2 | 2 | y | 2 | a |
| a1 | 21 | X | 10 | a |
| a7 | 23 | Z | 1 | a |
| a3 | 3 | z | 1 | b |
| a4 | 22 | Y | 20 | b |
| a5 | 5 | x2 | 1 | c |
| a6 | 6 | x3 | 2 | c |
trepo.replace_groups(df2)
trepo.get_full_df() # the whole C2==a and C2==b groups were replaced with the records present in df2; the C2==c group, absent from df2, is untouched
| | A | B | C | C2 |
|---|---|---|---|---|
| a1 | 21 | X | 10 | a |
| a7 | 23 | Z | 1 | a |
| a4 | 22 | Y | 20 | b |
| a5 | 5 | x2 | 1 | c |
| a6 | 6 | x3 | 2 | c |
trepo.replace_all(df2) # erases everything and puts df2 in. all traces of df are lost
trepo.get_full_df()
| | A | B | C | C2 |
|---|---|---|---|---|
| a1 | 21 | X | 10 | a |
| a7 | 23 | Z | 1 | a |
| a4 | 22 | Y | 20 | b |
trepo.replace_records(df, by_groups=True) # replaces records based on index, but only looks for matching indices within each group,
# so a duplicate a4 index is possible here: the two a4 records are in different groups (they have different values in C2)
trepo.get_full_df()
| A | B | C | C2 | |
|---|---|---|---|---|
| a7 | 23 | Z | 1 | a |
| a1 | 1 | x | 1 | a |
| a2 | 2 | y | 2 | a |
| a4 | 4 | x1 | 1 | a |
| a4 | 22 | Y | 20 | b |
| a3 | 3 | z | 1 | b |
| a5 | 5 | x2 | 1 | c |
| a6 | 6 | x3 | 2 | c |
trepo.purge() # deletes everything