Speed up Pandas cummin/cummax

Question

The Pandas cummin and cummax functions seem to be really slow for my use case with many groups. How can I speed them up?

Update

import pandas as pd
import numpy as np

from collections import defaultdict

def cummax(g, v):
    # Build a DataFrame from the group labels and the value columns,
    # then let pandas compute the grouped cumulative max directly.
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').cummax()
    result = result.values
    return result


def transform(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    # Same result as cummax() above, but routed through transform()
    # with a Python lambda.
    result = df.groupby('group').transform(lambda x: x.cummax())
    result = result.values
    return result

def itertuples(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    # Running maximum per group, stored as a list of tuples.
    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            # Elementwise max of the new row against the previous running max.
            d[g][-1] = tuple(max(a, b) for (a, b) in zip(d[g][-2], d[g][-1]))
        return d[g][-1]

    for row in df.itertuples(index=True):
        index = row[0]
        result[index] = d_(row[1], row[2:])

    result = np.asarray(result)
    return result

def numpy(g, v):
    # Same idea as itertuples(), but skips the DataFrame round trip and
    # uses np.maximum for the elementwise running max.
    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = np.maximum(d[g][-2], d[g][-1])
        return d[g][-1]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    result = np.asarray(result)
    return result



LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH // 2, size=LENGTH)
v = np.random.rand(LENGTH, 40)

%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)

gives

1 loop, best of 3: 22.5 s per loop    # cummax
1 loop, best of 3: 18.4 s per loop    # transform
1 loop, best of 3: 1.56 s per loop    # itertuples
1 loop, best of 3: 325 ms per loop    # numpy
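
For reference, a quick sanity check that the four implementations agree (a minimal sketch using the g and v arrays defined above):

r1, r2, r3, r4 = cummax(g, v), transform(g, v), itertuples(g, v), numpy(g, v)
assert np.allclose(r1, r2) and np.allclose(r1, r3) and np.allclose(r1, r4)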

Do you have any further suggestions for how I can improve my code?

Old

import pandas as pd
import numpy as np

LENGTH = 100000

df = pd.DataFrame(
    np.random.randint(low=0, high=LENGTH // 2, size=(LENGTH, 2)),
    columns=['group', 'value'])

df.groupby('group').cummax()

Tags: numpy, pandas, python, performance
Asked: 2017-01-07 21:01

2 Answers

  1. 2017-01-07 22:01

    We'll use a defaultdict whose default value is -np.inf: since we're taking running maximums, we want a default that every value is greater than.
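
    For illustration, a minimal sketch of that default in action:

    from collections import defaultdict
    import numpy as np

    d = defaultdict(lambda: -np.inf)
    d['a'] = max(d['a'], 3.0)   # first lookup returns -inf, so the max is 3.0
    d['a'] = max(d['a'], 1.0)   # running max stays 3.0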

    solution

    Given an array of groups g and an array of values v over which to accumulate maximums

    def pir1(g, v):
        d = defaultdict(lambda: -np.inf)
        result = np.empty(len(g))
    
        def d_(g, v):
            d[g] = max(d[g], v)
            return d[g]
    
        for i in range(len(g)):
            result[i] = d_(g[i], v[i])
    
        return result
    

    demonstration

    LENGTH = 100000
    g = np.random.randint(low=0, high=LENGTH // 2, size=LENGTH)
    v = np.random.rand(LENGTH)
    

    accuracy

    vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
    vm.eq(pir1(g, v)).all()
    
    True
    



    Deep Dive

    Compare with Divakar's answer

    headline chart
    [image: headline timing chart comparing the pir and div functions]

    code
    I took some liberties with Divakar's function to make it accurate.

    %%cython
    import numpy as np
    from collections import defaultdict
    
    # pir1 is the same code as pir2 below; the only difference is that this
    # definition is compiled via the %%cython cell magic
    def pir1(g, v):
        d = defaultdict(lambda: -np.inf)
        result = np.empty(len(g))
    
        def d_(g, v):
            d[g] = max(d[g], v)
            return d[g]
    
        for i in range(len(g)):
            result[i] = d_(g[i], v[i])
    
        return result
    

    def pir2(g, v):
        d = defaultdict(lambda: -np.inf)
        result = np.empty(len(g))
    
        def d_(g, v):
            d[g] = max(d[g], v)
            return d[g]
    
        for i in range(len(g)):
            result[i] = d_(g[i], v[i])
    
        return result
    
    def argsort_unique(idx):
        # Original idea : http://stackoverflow.com/a/41242285/3293881 
        n = idx.size
        sidx = np.empty(n,dtype=int)
        sidx[idx] = np.arange(n)
        return sidx
    
    def div1(groupby, value): 
        sidx = np.argsort(groupby,kind='mergesort')
        sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    
        # Get shifts to be used for shifting each group
        mx = sorted_value.max() + 1
        shifts = sorted_groupby * mx
    
        # Shift, then take the running max along the value column.
        # The shifts keep the cumulative max confined within each group.
        group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
        return group_cummaxed[argsort_unique(sidx)]
    
    def div2(groupby, value):
        sidx = np.argsort(groupby, kind='mergesort')
        sorted_groupby, sorted_value = groupby[sidx], value[sidx]
        # factorize groups to integers
        sorted_groupby = np.append(
            0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()
    
        # Get shifts to be used for shifting each group
        mx = sorted_value.max() + 1
        shifts = (sorted_groupby - sorted_groupby.min()) * mx 
    
        # Shift, then take the running max along the value column.
        # The shifts keep the cumulative max confined within each group.
        group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
        return group_cummaxed[argsort_unique(sidx)]
    

    NOTES:

    • It was necessary to factorize the groups in Divakar's solution to generalize it to non-integer group labels (see the sketch below)
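
    Factorizing here just means relabeling the (already sorted) group keys as consecutive integers 0, 1, 2, ..., which is exactly what the cumsum line in div2 does. A minimal equivalent sketch using pd.factorize on the sorted labels:

    import numpy as np
    import pandas as pd

    sorted_g = np.array([0.5, 0.5, 0.5, 1.7, 1.7])  # labels after the mergesort step
    codes, uniques = pd.factorize(sorted_g)         # codes -> array([0, 0, 0, 1, 1])
    # codes are consecutive non-negative ints, so the shift trick applies safely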

    accuracy

    integer groups
    Over integer based groups, both div1 and div2 produce the same results

    np.isclose(div1(g, v), pir1(g, v)).all()
    
    True
    
    np.isclose(div2(g, v), pir1(g, v)).all()
    
    True
    

    general groups
    Over string- and float-based groups, div1 becomes inaccurate, but this is easily fixed: with non-integer labels, the shifts sorted_groupby * mx no longer separate adjacent groups by at least mx, so maximums can leak across group boundaries. div2 avoids this by factorizing the labels to integers first.

    g = g / 1000000  # turn the integer group labels into small floats
    
    np.isclose(div1(g, v), pir1(g, v)).all()
    
    False
    
    np.isclose(div2(g, v), pir1(g, v)).all()
    
    True
    

    time testing

    results = pd.DataFrame(
        index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
        columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))

    size_map = dict(Large=100, Medium=10, Small=1)

    from timeit import timeit

    for i in results.index:
        for j in results.columns:
            # integer-dividing g by the size factor reduces the number of
            # distinct groups, i.e. makes each group larger
            results.loc[i, j] = timeit(
                '{}(g // size_map[j], v)'.format(i),
                'from __main__ import {}, size_map, g, v, j'.format(i),
                number=100)

    results
    

    [table: timing results (seconds per 100 runs) for pir1, pir2, div1, div2 across group sizes]

    results.T.plot.barh()
    

    [chart: horizontal bar plot of the timing results]

  2. 2017-01-08 11:01

    Proposed approach

    Let's bring some NumPy magic to the table! Specifically, we will exploit np.maximum.accumulate.
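
    As a quick reminder, np.maximum.accumulate on a plain array gives the running maximum:

    import numpy as np

    np.maximum.accumulate(np.array([3, 1, 4, 1, 5]))
    # -> array([3, 3, 4, 4, 5])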

    Explanation

    To see how maximum.accumulate could help us, let's assume we have the groups lined up sequentially.

    Let's consider a sample groupby column:

    groupby column : [0, 0, 0, 1, 1, 2, 2, 2, 2, 2]
    

    and a sample value column:

    value column   : [3, 1, 4, 1, 3, 3, 1, 5, 2, 4]
    

    Using maximum.accumulate simply on value won't give us the desired output, as we need to do these accumulations only within each group. To do so, one trick would be to offset each group from the group before it.

    There are a few ways to do that offsetting. One easy way is to offset each group by max(value) + 1 more than the previous group. For this sample, that offset step is 6. So we add 6 to the second group, 12 to the third, and so on. The modified value column becomes -

    value column  : [3, 1, 4, 7, 9, 15, 13, 17, 14, 16]
    

    Now, we can use maximum.accumulate, and the accumulation stays restricted within each group -

    value cummaxed: [3, 3, 4, 7, 9, 15, 15, 17, 17, 17]
    

    To go back to the original values, subtract the offsets that were added before.

    value cummaxed: [3, 3, 4, 1, 3, 3, 3, 5, 5, 5]
    

    That's our desired result!
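
    Putting the whole trick together on the sample arrays above (a minimal sketch, assuming non-negative integer group labels that are already lined up sequentially):

    import numpy as np

    g = np.array([0, 0, 0, 1, 1, 2, 2, 2, 2, 2])
    v = np.array([3, 1, 4, 1, 3, 3, 1, 5, 2, 4], dtype=float)

    mx = v.max() + 1   # offset step: 6 for this sample
    shifts = g * mx    # [0, 0, 0, 6, 6, 12, 12, 12, 12, 12]
    out = np.maximum.accumulate(shifts + v) - shifts
    # out -> [3. 3. 4. 1. 3. 3. 3. 5. 5. 5.]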

    At the start, we assumed the groups to be sequential. To get the data into that format, we use np.argsort(groupby, kind='mergesort'), which returns the sorted indices while keeping the original order among equal elements (a stable sort), and then use those indices to reorder the groupby and value columns.
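
    For illustration, a small sketch of the stable sort, plus the inverse permutation that argsort_unique (defined below) uses to restore the original order:

    import numpy as np

    g = np.array([2, 0, 2, 1, 0])
    sidx = np.argsort(g, kind='mergesort')   # stable: ties keep original order
    # sidx -> array([1, 4, 3, 0, 2])

    inv = np.empty_like(sidx)
    inv[sidx] = np.arange(sidx.size)         # inverse permutation (argsort_unique)
    # g[sidx][inv] recovers g exactly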

    To account for negative groupby elements, we just need to offset by max() - min() rather than just max().

    Thus, the implementation would look something like this -

    def argsort_unique(idx):
        # Original idea : http://stackoverflow.com/a/41242285/3293881 
        n = idx.size
        sidx = np.empty(n,dtype=int)
        sidx[idx] = np.arange(n)
        return sidx
    
    def numpy_cummmax(groupby, value, factorize_groupby=0): 
        # Inputs : 1D arrays.
    
        # Get sorted indices keeping the order. Sort groupby and value cols.
        sidx = np.argsort(groupby,kind='mergesort')
        sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    
        if factorize_groupby==1:
            sf = np.concatenate(([0], np.flatnonzero(sorted_groupby[1:] != \
                                sorted_groupby[:-1])+1, [sorted_groupby.size] ))
            sorted_groupby = np.repeat(np.arange(sf.size-1), sf[1:] - sf[:-1])
    
        # Get shifts to be used for shifting each group
        mx = sorted_groupby.max()-sorted_groupby.min()+1
        shifts = sorted_groupby*mx
    
        # Shift, then take the running max along the value column.
        # The shifts keep the cumulative max confined within each group.
        group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
        return group_cummaxed[argsort_unique(sidx)]
    

    Runtime test and verification

    Verification

    1) Groupby as ints :

    In [58]: # Setup with groupby as ints
        ...: LENGTH = 1000
        ...: g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
        ...: v = np.random.rand(LENGTH)
        ...: 
    
    In [59]: df = pd.DataFrame(np.column_stack((g,v)),columns=['group', 'value'])
    
    In [60]: # Verify results
        ...: out1 = df.groupby('group').cummax()
        ...: out2 = numpy_cummmax(df['group'].values, df['value'].values)
        ...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
        ...: 
    True
    

    2) Groupby as floats :

    In [10]: # Setup with groupby as floats
        ...: LENGTH = 100000
        ...: df = pd.DataFrame(np.random.randint(0,LENGTH//2,(LENGTH,2))/10.0, \
        ...:                             columns=['group', 'value'])
    
    In [18]: # Verify results
        ...: out1 = df.groupby('group').cummax()
        ...: out2 = numpy_cummmax(df['group'].values, df['value'].values, factorize_groupby=1)
        ...: print np.allclose(out1.values.ravel(), out2, atol=1e-5)
        ...: 
    True
    

    Timings -

    1) Groupby as ints (same as the setup used for timings in the question) :

    In [24]: LENGTH = 100000
        ...: g = np.random.randint(0,LENGTH//2,(LENGTH))
        ...: v = np.random.rand(LENGTH)
        ...: 
    
    In [25]: %timeit numpy(g, v) # Best solution from posted question
    1 loops, best of 3: 373 ms per loop
    
    In [26]: %timeit pir1(g, v) # @piRSquared's solution-1
    1 loops, best of 3: 165 ms per loop
    
    In [27]: %timeit pir2(g, v) # @piRSquared's solution-2
    1 loops, best of 3: 157 ms per loop
    
    In [28]: %timeit numpy_cummmax(g, v)
    100 loops, best of 3: 18.3 ms per loop
    

    2) Groupby as floats :

    In [29]: LENGTH = 100000
        ...: g = np.random.randint(0,LENGTH//2,(LENGTH))/10.0
        ...: v = np.random.rand(LENGTH)
        ...: 
    
    In [30]: %timeit pir1(g, v) # @piRSquared's solution-1
    1 loops, best of 3: 157 ms per loop
    
    In [31]: %timeit pir2(g, v) # @piRSquared's solution-2
    1 loops, best of 3: 156 ms per loop
    
    In [32]: %timeit numpy_cummmax(g, v, factorize_groupby=1)
    10 loops, best of 3: 20.8 ms per loop
    
    In [34]: np.allclose(pir1(g, v),numpy_cummmax(g, v, factorize_groupby=1),atol=1e-5)
    Out[34]: True
    