Quickstart
==========
.. code:: ipython3

    import pandas as pd
    from parquetranger import TableRepo

.. code:: ipython3

    df = pd.DataFrame(
        {
            "A": [1, 2, 3, 4, 5, 6],
            "B": ["x", "y", "z", "x1", "x2", "x3"],
            "C": [1, 2, 1, 1, 1, 2],
            "C2": ["a", "a", "b", "a", "c", "c"],
        },
        index=["a1", "a2", "a3", "a4", "a5", "a6"],
    )

.. code:: ipython3

    df

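.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a1</th><td>1</td><td>x</td><td>1</td><td>a</td></tr>
        <tr><th>a2</th><td>2</td><td>y</td><td>2</td><td>a</td></tr>
        <tr><th>a3</th><td>3</td><td>z</td><td>1</td><td>b</td></tr>
        <tr><th>a4</th><td>4</td><td>x1</td><td>1</td><td>a</td></tr>
        <tr><th>a5</th><td>5</td><td>x2</td><td>1</td><td>c</td></tr>
        <tr><th>a6</th><td>6</td><td>x3</td><td>2</td><td>c</td></tr>
      </tbody>
    </table>
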
.. code:: ipython3

    trepo = TableRepo("some_tmp_path", group_cols="C2")  # this creates the directory

.. code:: ipython3

    trepo.extend(df)

.. code:: ipython3

    trepo.get_full_df()

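.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a1</th><td>1</td><td>x</td><td>1</td><td>a</td></tr>
        <tr><th>a2</th><td>2</td><td>y</td><td>2</td><td>a</td></tr>
        <tr><th>a4</th><td>4</td><td>x1</td><td>1</td><td>a</td></tr>
        <tr><th>a3</th><td>3</td><td>z</td><td>1</td><td>b</td></tr>
        <tr><th>a5</th><td>5</td><td>x2</td><td>1</td><td>c</td></tr>
        <tr><th>a6</th><td>6</td><td>x3</td><td>2</td><td>c</td></tr>
      </tbody>
    </table>
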
.. code:: ipython3

    df2 = pd.DataFrame(
        {
            "A": [21, 22, 23],
            "B": ["X", "Y", "Z"],
            "C": [10,20,1],
            "C2": ["a", "b", "a"],
        },
        index=["a1", "a4", "a7"]
    )

.. code:: ipython3

    trepo.replace_records(df2)  # replaces based on index

.. code:: ipython3

    trepo.get_full_df()

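.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a2</th><td>2</td><td>y</td><td>2</td><td>a</td></tr>
        <tr><th>a1</th><td>21</td><td>X</td><td>10</td><td>a</td></tr>
        <tr><th>a7</th><td>23</td><td>Z</td><td>1</td><td>a</td></tr>
        <tr><th>a3</th><td>3</td><td>z</td><td>1</td><td>b</td></tr>
        <tr><th>a4</th><td>22</td><td>Y</td><td>20</td><td>b</td></tr>
        <tr><th>a5</th><td>5</td><td>x2</td><td>1</td><td>c</td></tr>
        <tr><th>a6</th><td>6</td><td>x3</td><td>2</td><td>c</td></tr>
      </tbody>
    </table>
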
.. code:: ipython3

    trepo.replace_groups(df2)

.. code:: ipython3

    trepo.get_full_df()  # the whole groups where C2==a and C2==b were replaced with the records present in df2

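.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a1</th><td>21</td><td>X</td><td>10</td><td>a</td></tr>
        <tr><th>a7</th><td>23</td><td>Z</td><td>1</td><td>a</td></tr>
        <tr><th>a4</th><td>22</td><td>Y</td><td>20</td><td>b</td></tr>
        <tr><th>a5</th><td>5</td><td>x2</td><td>1</td><td>c</td></tr>
        <tr><th>a6</th><td>6</td><td>x3</td><td>2</td><td>c</td></tr>
      </tbody>
    </table>
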
.. code:: ipython3

    trepo.replace_all(df2)  # erases everything and writes df2 in its place; all traces of df are lost

.. code:: ipython3

    trepo.get_full_df()

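.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a1</th><td>21</td><td>X</td><td>10</td><td>a</td></tr>
        <tr><th>a7</th><td>23</td><td>Z</td><td>1</td><td>a</td></tr>
        <tr><th>a4</th><td>22</td><td>Y</td><td>20</td><td>b</td></tr>
      </tbody>
    </table>
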
.. code:: ipython3

    trepo.replace_records(df, by_groups=True)  # replaces records based on index, but only looks for indices within each group,
    # so a duplicate a4 index is possible: the two a4 records are in different groups, with different values in C2

.. code:: ipython3

    trepo.get_full_df()

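.. raw:: html

    <table border="1" class="dataframe">
      <thead>
        <tr><th></th><th>A</th><th>B</th><th>C</th><th>C2</th></tr>
      </thead>
      <tbody>
        <tr><th>a7</th><td>23</td><td>Z</td><td>1</td><td>a</td></tr>
        <tr><th>a1</th><td>1</td><td>x</td><td>1</td><td>a</td></tr>
        <tr><th>a2</th><td>2</td><td>y</td><td>2</td><td>a</td></tr>
        <tr><th>a4</th><td>4</td><td>x1</td><td>1</td><td>a</td></tr>
        <tr><th>a4</th><td>22</td><td>Y</td><td>20</td><td>b</td></tr>
        <tr><th>a3</th><td>3</td><td>z</td><td>1</td><td>b</td></tr>
        <tr><th>a5</th><td>5</td><td>x2</td><td>1</td><td>c</td></tr>
        <tr><th>a6</th><td>6</td><td>x3</td><td>2</td><td>c</td></tr>
      </tbody>
    </table>
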
.. code:: ipython3

    trepo.purge()  # deletes everything
