Quickstart
==========

.. code:: ipython3

    import pandas as pd

    from parquetranger import TableRepo

.. code:: ipython3

    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6],
            "B": ["x", "y", "z", "x1", "x2", "x3"],
            "C": [1, 2, 1, 1, 1, 2],
            "C2": ["a", "a", "b", "a", "c", "c"],
        },
        index=["a1", "a2", "a3", "a4", "a5", "a6"],
    )

.. code:: ipython3

    df

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a1    1     x     1     a
a2    2     y     2     a
a3    3     z     1     b
a4    4     x1    1     a
a5    5     x2    1     c
a6    6     x3    2     c
====  ====  ====  ====  ====


.. code:: ipython3

    trepo = TableRepo("some_tmp_path", group_cols="C2")  # this creates the directory

.. code:: ipython3

    trepo.extend(df)

.. code:: ipython3

    trepo.get_full_df()

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a1    1     x     1     a
a2    2     y     2     a
a4    4     x1    1     a
a3    3     z     1     b
a5    5     x2    1     c
a6    6     x3    2     c
====  ====  ====  ====  ====


.. code:: ipython3

    df2 = pd.DataFrame(
        {
            "A": [21, 22, 23],
            "B": ["X", "Y", "Z"],
            "C": [10,20,1],
            "C2": ["a", "b", "a"],
        },
        index=["a1", "a4", "a7"]
    )

.. code:: ipython3

    trepo.replace_records(df2)  # replaces based on index

.. code:: ipython3

    trepo.get_full_df()

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a2    2     y     2     a
a1    21    X     10    a
a7    23    Z     1     a
a3    3     z     1     b
a4    22    Y     20    b
a5    5     x2    1     c
a6    6     x3    2     c
====  ====  ====  ====  ====


.. code:: ipython3

    trepo.replace_groups(df2)

.. code:: ipython3

    trepo.get_full_df()
    # replaced the whole groups where C2==a and C2==b
    # with the records that were present in df2

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a1    21    X     10    a
a7    23    Z     1     a
a4    22    Y     20    b
a5    5     x2    1     c
a6    6     x3    2     c
====  ====  ====  ====  ====


.. code:: ipython3

    trepo.replace_all(df2)  # erases everything and puts df2 in; all traces of df are lost

.. code:: ipython3

    trepo.get_full_df()

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a1    21    X     10    a
a7    23    Z     1     a
a4    22    Y     20    b
====  ====  ====  ====  ====


.. code:: ipython3

    trepo.replace_records(df, by_groups=True)
    # replaces records based on index, but only looks for indices within groups,
    # so this way a duplicate a4 index is possible,
    # as they are in different groups, with different values in C2

.. code:: ipython3

    trepo.get_full_df()

====  ====  ====  ====  ====
..    A     B     C     C2
====  ====  ====  ====  ====
a7    23    Z     1     a
a1    1     x     1     a
a2    2     y     2     a
a4    4     x1    1     a
a4    22    Y     20    b
a3    3     z     1     b
a5    5     x2    1     c
a6    6     x3    2     c
====  ====  ====  ====  ====


.. code:: ipython3

    trepo.purge()  # deletes everything