
Mathias Asplin Big data analytics and processing
# Data IO: load the raw benchmark rows. The file is semicolon-separated and is
# read without a header row (column names come from `names`, dtypes from
# `schema`); the result is rounded to two decimals.
df_dataio = pd.read_csv(
    'DataIO.csv', sep=';', names=names, dtype=schema, header=None
).round(2)
# Ratio of CPU time to wall time, computed from the already-rounded columns.
df_dataio['Ratio'] = df_dataio.CpuTime / df_dataio.WallTime
# Append the tables built from the Data IO measurements (one entry per
# distinct Library value is passed on, together with the 'WallTime' column name).
df_tables += transform_dataframes(
    [df_dataio], df_dataio.Library.unique(), 'WallTime'
)
# Data exploration: same file layout and post-processing as the other
# benchmark files (semicolon-separated, no header row, rounded to 2 decimals).
df_dataexploration = pd.read_csv(
    'DataExploration.csv', sep=';', names=names, dtype=schema, header=None
).round(2)
df_dataexploration['Ratio'] = (
    df_dataexploration.CpuTime / df_dataexploration.WallTime
)

# Big data: one file holds all data-set sizes; it is split below by matching
# the size keyword in the Operation column (case-insensitive).
df_bigdata = pd.read_csv(
    'BigData.csv', sep=';', names=names, dtype=schema, header=None
).round(2)
df_bigdata['Ratio'] = df_bigdata.CpuTime / df_bigdata.WallTime

df_medium = df_bigdata[
    df_bigdata.Operation.str.contains('medium', case=False, regex=True)
].copy()
# 'large' also matches 'verylarge', so rows containing 'very' are excluded.
# NOTE(review): the extracted listing showed padding blanks inside the
# 'large' / 'verylarge' literals; they are treated as extraction artifacts --
# confirm against the original script.
df_large = df_bigdata[
    (df_bigdata.Operation.str.contains('large', case=False, regex=True))
    & (~df_bigdata.Operation.str.contains('very', case=False, regex=True))
].copy()
df_verylarge = df_bigdata[
    df_bigdata.Operation.str.contains('verylarge', case=False, regex=True)
].copy()
df_many = df_bigdata[
    df_bigdata.Operation.str.contains('many', case=False, regex=True)
].copy()

# Order matters: each frame contributes tables appended to df_tables in
# this sequence.
df_list = [df_dataexploration, df_medium, df_large, df_verylarge, df_many]
df_tables += transform_dataframes(
    df_list, df_dataexploration.Library.unique(), 'WallTime'
)
# Round the first eight tables to three decimals and clean up their labels.
# NOTE(review): in the extracted listing both replace() arguments appear as
# blanks; underscore -> space is inferred from the space-separated labels
# used further down ('Pandas python', 'Read csv', ...) -- confirm against
# the original script.
for i in range(8):
    df_tables[i] = df_tables[i].round(3)
    df_tables[i].columns = df_tables[i].columns.str.replace('_', ' ')
    df_tables[i].index = df_tables[i].index.str.replace('_', ' ')

# Tables 3-7 get generic column headers 'Task 1' .. 'Task 18'; the assignment
# requires each of these tables to have exactly 18 columns. The comprehension
# uses its own variable name so it no longer shadows the loop index.
for i in range(3, 8):
    df_tables[i].columns = [f'Task {task}' for task in range(1, 19)]
# The three Pandas rows of the first table: a missing value in one of them is
# replaced by the column-wise minimum taken over these three rows.
pandas_rows = ['Pandas python', 'Pandas c', 'Pandas numpy']
df_sub_table = df_tables[0].loc[pandas_rows]
df_sub_table = df_sub_table.fillna(df_sub_table.min())
# Work on a copy so df_tables[0] itself keeps its gaps.
df_dataio = df_tables[0].copy()
df_dataio.loc[pandas_rows] = df_sub_table
# Per-library row totals, one two-column frame per category. reset_index()
# turns the row labels into a column; the merge further down expects that
# column to be called 'Library' (presumably the index name set by
# transform_dataframes -- verify).

# Gap-filled Data IO table, without the 'Pandas c' / 'Pandas python' rows.
df_read_merge_sum = (
    df_dataio.sum(axis=1)
    .drop(['Pandas c', 'Pandas python'])
    .reset_index(name='Read Merge')
)

df_data_wrangling_sum = (
    df_tables[1].sum(axis=1).reset_index(name='Data Wrangling')
)

# min_count=1: a row whose selected cells are all missing sums to NaN, not 0.
df_read_write_csv_sum = (
    df_tables[2][['Read csv', 'Write csv']]
    .sum(axis=1, min_count=1)
    .reset_index(name='Read Write Csv')
)

# Parquet via pyarrow: every row except 'Polars outofcore'.
df_read_write_parquet_pa_sum = (
    df_tables[2]
    .drop(['Polars outofcore'])[
        ['Read pyarrow parquet', 'Write pyarrow parquet']
    ]
    .sum(axis=1, min_count=1)
    .reset_index(name='R/W Parquet (PA)')
)

# Parquet via the rust reader/writer: only the two Polars rows.
df_read_write_parquet_r_sum = (
    df_tables[2]
    .loc[['Polars inmem', 'Polars outofcore']][
        ['Read rust parquet', 'Write rust parquet']
    ]
    .sum(axis=1)
    .reset_index(name='R/W Parquet (R)')
)
# Fold the per-category sums into one wide frame keyed by 'Library'. The
# outer join keeps libraries that are missing from some categories (their
# cells become NaN); the result is ordered by library name.
df_merged = reduce(
    lambda left, right: pd.merge(
        left, right, on=['Library'], how='outer', sort=True
    ),
    [
        df_read_merge_sum,
        df_data_wrangling_sum,
        df_read_write_csv_sum,
        df_read_write_parquet_pa_sum,
        df_read_write_parquet_r_sum,
    ],
).sort_values(by='Library')
# Store the summary table, indexed by library.
df_tables.append(df_merged.set_index('Library'))

# Build a mask table with the same rows: for every column, the cell holding
# the smallest value becomes 1 and all other cells stay NaN.
# NOTE: despite its name, df_largest marks the *smallest* entry per column
# (nsmallest with n=1).
df_merged = df_merged.set_index('Library')
df_largest = df_merged.apply(pd.Series.nsmallest, n=1, axis=0).reindex(
    df_merged.index, axis=0
)
# Restore 'Library' as an ordinary column for the code that follows.
df_merged = df_merged.reset_index('Library')
df_largest[~df_largest.isnull()] = 1
df_tables.append(df_largest)
df merged = reduce(lambda left, right: pd.merge(left, right ,on=[’Library’ ], how=’outer’, sort=True),
91