Waveform analysis - exploiting numpy functions

In this example a peak search is performed just using internal numpy functions (no for loops).

List of useful functions:

  • np.diff - used to compute the 1st derivative
  • np.roll - used to move the array 1 (or more) index forward (or backward) to search for crossing points
  • np.searchsorted - used to deal with "starts" and "stops" of different size.
  • np.ufunc.reduceat - used to sum over a set of ranges (array of starts and stops)
In [1]:
import numpy as np
import matplotlib.pyplot as plt
In [2]:
wf = np.load('sample_expo300.npz')['wf']
# wf = np.load('data/poiss500.npz')['wf']
In [3]:
%matplotlib inline
In [4]:
fig, ax = plt.subplots(figsize=(18,4))
ax.plot(wf[:2000], color='k')
plt.show()

Search of the pulses using the 1st derivative

In [5]:
wfDER = np.diff(wf, n=1)
In [6]:
fig, ax = plt.subplots(figsize=(18,4))
ax.plot(wf[:2000], color='k', alpha=0.4)
ax2 = ax.twinx()
ax2.tick_params(axis ='y', labelcolor = 'b')
ax2.plot(wfDER[:2000], color='b')
plt.show()

Search of the start: beginning of the light pulse. It happens when the 1st derivative goes beyond a given threshold

Search of the stop: end of the light pulse. It happens when the 1st derivative goes below a given threshold

In [7]:
thrUP   = 25
thrDOWN = 0

crossesUP = (wfDER>thrUP) & (np.roll(wfDER,1)<=thrUP) 
start,  = np.where(crossesUP)  

crossesDOWN = (wfDER<thrDOWN) & (np.roll(wfDER,1)>=thrDOWN) 
stop,  = np.where(crossesDOWN)
In [8]:
time = np.arange(wf.size)  # simple time array

fig, ax = plt.subplots(figsize=(18,4))
ax.plot(wf, color='k', alpha=0.4)
ax2 = ax.twinx()
ax2.tick_params(axis ='y', labelcolor = 'b')
ax2.plot(wfDER, color='b')
ax2.plot(start, np.ones(start.size)*110, '>g', label='Start')
ax2.axhline(thrUP, color='g', linestyle='--', linewidth=1)
ax2.plot(stop, np.ones(stop.size)*90, '<r', label='Stop')
ax2.axhline(thrDOWN, color='r', linestyle='--', linewidth=1)
plt.legend()
plt.show()

Zoom

In [9]:
fig, ax = plt.subplots(figsize=(18,4))
ax.plot(wf[:500], color='k', alpha=0.4)
ax2 = ax.twinx()
ax2.tick_params(axis ='y', labelcolor = 'b')
ax2.plot(wfDER[:500], color='b')
ax2.plot(start[0], np.ones(1)*110, '>g', label='Start')
ax2.axhline(thrUP, color='g', linestyle='--', linewidth=1)
ax2.plot(stop[:100], np.ones(100)*90, '<r', label='Stop')
ax2.axhline(thrDOWN, color='r', linestyle='--', linewidth=1)
plt.legend()
plt.show()

A possible problem is that there could be more stops than starts.

This problem can be solved by using a function that scan along time the waveform:

In [10]:
def scan_and_find_pulses(wf, thrUP=30, thrDOWN=0):
    starts, stops = [], []
    pulseON = False

    for i in range(wfDER.size):
        if i>0:
            if (wfDER[i]>thrUP) & (wfDER[i-1]<=thrUP) & (not pulseON):
                starts.append(i)
                pulseON = True
            if (wfDER[i]<thrDOWN) & (wfDER[i-1]>=thrDOWN) & pulseON:
                stops.append(i)
                pulseON = False
    starts = np.array(starts)
    stops = np.array(stops)
    return np.vstack((starts, stops))
In [11]:
scan_and_find_pulses(wfDER, 30, 0)
Out[11]:
array([[  414,   884,  1004,  1733,  1743,  2361,  2452,  3291,  3775,
         3983,  4132,  4266,  4285,  4316,  4381,  4510,  4720,  4855,
         4998,  5317,  5682,  6028,  6409,  6592,  6758,  7402,  7974,
         8001,  8310,  8738,  8747,  9602, 10227, 10576, 10871, 10967,
        11126, 11275, 11593, 11616, 12282, 12643, 12665, 12673],
       [  425,   895,  1016,  1742,  1746,  2369,  2463,  3302,  3783,
         3994,  4143,  4269,  4288,  4326,  4391,  4521,  4732,  4869,
         5009,  5329,  5688,  6037,  6420,  6600,  6765,  7413,  7986,
         8008,  8316,  8746,  8750,  9612, 10237, 10585, 10881, 10977,
        11130, 11285, 11604, 11625, 12295, 12656, 12672, 12679]])

How to match the 2 arrays without looping on them?

A simple example

In [12]:
starts = np.array([1, 10, 30, 50, 95])
stops  = np.array([6, 15, 19, 35, 39, 67, 110])

Graphical representation

In [13]:
plt.subplots(figsize=(12,1))
plt.plot(starts, np.ones(starts.size)*5, '>g')
for i in range(starts.size):
    plt.text(starts[i], 0,f'{starts[i]}', {'color':'g', 'ha':'center'})
plt.plot(stops, np.ones(stops.size)*4, '<r')
for i in range(stops.size):
    plt.text(stops[i], 0,f'{stops[i]}', {'color':'r', 'ha':'center'})
    
plt.ylim(-3,6)
plt.axis('off')
plt.show()

The numpy searchsorted() function can be used. For each element of the second array (stops), find where it should be inserted in the first array to be properly sorted.

The return value is an index array with the same shape of stops

In [14]:
np.searchsorted(starts, stops)
Out[14]:
array([1, 2, 2, 3, 3, 4, 5])

Notice that where the stops are not folloing a start, the index does not change. The, if a np.diff() is calculated on this array (with a prepend=0 argument), a series of 1s and 0s will be returned where 1s indicate the position of a "real" stop.

In [15]:
np.diff( np.searchsorted(starts, stops), prepend=0)
Out[15]:
array([1, 1, 0, 1, 0, 1, 1])

In order to have the list of index where this happens the np.where() function can be used:

In [16]:
np.where( np.diff( np.searchsorted(starts, stops), prepend=0) )
Out[16]:
(array([0, 1, 3, 5, 6]),)

Visually:

In [17]:
real_stops, = np.where( np.diff( np.searchsorted(starts, stops), prepend=0) )

plt.subplots(figsize=(12,1))
plt.plot(starts, np.ones(starts.size)*5, '>g')
for i in range(starts.size):
    plt.text(starts[i], 0,f'{starts[i]}', {'color':'g', 'ha':'center'})
plt.plot(stops, np.ones(stops.size)*4, '<r')
for i in range(stops.size):
    plt.text(stops[i], 0,f'{stops[i]}', {'color':'r', 'ha':'center'})
#     plt.text(stops[i], -2,f'{np.searchsorted(starts, stops)[i]}', {'color':'r', 'ha':'center'})
    plt.text(stops[i], -4,f'{np.diff( np.searchsorted(starts, stops), prepend=0)[i]}', {'color':'r', 'ha':'center'})

plt.ylim(-3,6)

for p in stops[real_stops]:
    plt.gca().add_artist(plt.Circle((p, 0.5), 2, fill=False, edgecolor='red'))
plt.axis('off')
plt.show()

What happen if one start is not followed by a stop?

In [18]:
starts = np.array([1, 10, 30, 50, 56, 95, 100])
stops  = np.array([6, 15, 19, 35, 39, 67, 110])
In [19]:
np.searchsorted(starts, stops)
Out[19]:
array([1, 2, 2, 3, 3, 5, 7])
In [20]:
np.diff( np.searchsorted(starts, stops), prepend=0)
Out[20]:
array([1, 1, 0, 1, 0, 2, 2])

There is a number greater that one in the diff array. But how can a "start without a stop" be detected?

Using the searchsorted the other way around and spotting for repeated numbers...

In [21]:
np.searchsorted(stops, starts)
Out[21]:
array([0, 1, 3, 5, 5, 6, 6])

... and using the diff to highlight them (whenever a start is preceded by another start, the first is the one to be kept):

In [22]:
np.diff(np.searchsorted(stops, starts), prepend=-1)
Out[22]:
array([1, 1, 2, 2, 0, 1, 0])
In [23]:
real_starts, = np.where(np.diff(np.searchsorted(stops, starts), prepend=-1))
real_starts
Out[23]:
array([0, 1, 2, 3, 5])
In [24]:
real_stops, = np.where( np.diff( np.searchsorted(starts, stops), prepend=0) )

plt.subplots(figsize=(12,1))
plt.plot(starts, np.ones(starts.size)*5, '>g')
for i in range(starts.size):
    plt.text(starts[i], 0,f'{starts[i]}', {'color':'g', 'ha':'center'})
    plt.text(starts[i], -4,f'{np.diff( np.searchsorted(stops, starts), prepend=-1)[i]}', {'color':'g', 'ha':'center'})
plt.plot(stops, np.ones(stops.size)*4, '<r')
    
for i in range(stops.size):
    plt.text(stops[i], 0,f'{stops[i]}', {'color':'r', 'ha':'center'})
#     plt.text(stops[i], -2,f'{np.searchsorted(starts, stops)[i]}', {'color':'r', 'ha':'center'})
    plt.text(stops[i], -4,f'{np.diff( np.searchsorted(starts, stops), prepend=0)[i]}', {'color':'r', 'ha':'center'})

plt.ylim(-3,6)

for p in stops[real_stops]:
    plt.gca().add_artist(plt.Circle((p, 0.5), 2, fill=False, edgecolor='red'))
for p in starts[real_starts]:
    plt.gca().add_artist(plt.Circle((p, 0.5), 2, fill=False, edgecolor='green'))
plt.axis('off')
plt.show()

Then the two arrays of "unique" coupled starts and stops can be vstacked

In [25]:
np.vstack((starts[real_starts], stops[real_stops]))
Out[25]:
array([[  1,  10,  30,  50,  95],
       [  6,  15,  35,  67, 110]])

Back to the waveform

Let's build a function which returs a 2D array of matched starts and stops

In [26]:
def get_pulses(wf, thrUP=30, thrDOWN=0):
    starts,  = np.where((wfDER>thrUP) & (np.roll(wfDER,1)<=thrUP) )  
    stops,  = np.where((wfDER<thrDOWN) & (np.roll(wfDER,1)>=thrDOWN) )
    
    real_starts, = np.where(np.diff(np.searchsorted(stops, starts), prepend=-1))
    real_stops, = np.where( np.diff( np.searchsorted(starts, stops), prepend=0) )
    return np.vstack((starts[real_starts], stops[real_stops]))
In [27]:
pulses = get_pulses(wfDER, 50, 0)
pulses.shape
Out[27]:
(2, 38)

Now that the pulses' location is available, a way of measuring the pulse amplitude can be a sum in that precise ranges. For example the 1st derivative integral can be used.

We can do it "manually" by a for loop:

In [28]:
pulses_ampl = []

# loop on all pulses
for i in range(pulses.shape[1]):
    tot = 0
    # loop on the pulse range
    p_start = pulses[0,i]
    p_stop  = pulses[1,i]
    for j in range(p_start, p_stop + 1):
        tot += wfDER[j]

    pulses_ampl.append(tot)

Or we can implement a faster function using the numpy.reduceat() function which extend an operation (e.g. "add") to a given set of indeces.

Before, the arrays of starts and stops must be merged "inline" in a single array:

In [29]:
# creating an empty array with 2 x start shape (better to force the dtype since it has to be used later as index array)
alternated = np.zeros(pulses.shape[1]*2, dtype='int')

# filling the array alternatively with start and stop, start and stop, ...
alternated[::2] = pulses[0,:]
alternated[1::2] = pulses[1,:]
In [30]:
print(pulses.shape, alternated.shape)
(2, 38) (76,)

The same result can be obtained by using numpy.flatten() with "F" ordering:

In [31]:
alternated = pulses.flatten('F')

Now the "add" operation can be applied repeat repetitively across an array of index, taking the results every other index ([::2])

In [32]:
amplitudes = np.add.reduceat(wfDER, alternated)[::2]
amplitudes.shape
Out[32]:
(38,)

Final test: explicits loops vs numpy-only functions

In [33]:
# wf = np.load('sample_expo300.npz')['wf']
wf = np.load('data/expo500.npz')['wf']
wfDER = np.diff(wf, n=1)

Explicit loops

In [34]:
def find_pulses_explicit(wfDER, thrUP=30, thrDOWN=0):
    starts, stops = [], []
    pulseON = False

    for i in range(wfDER.size):
        if i>0:
            if (wfDER[i]>thrUP) & (wfDER[i-1]<=thrUP) & (not pulseON):
                starts.append(i)
                pulseON = True
            if (wfDER[i]<thrDOWN) & (wfDER[i-1]>=thrDOWN) & pulseON:
                stops.append(i)
                pulseON = False
    starts = np.array(starts)
    stops = np.array(stops)
    return np.vstack((starts, stops))



def find_amplitudes_explicit(wfDER, thrUP=30, thrDOWN=0):
    pulses_ampl = []
    
    pulses = find_pulses_explicit(wfDER, thrUP, thrDOWN)
    # loop on all pulses
    for i in range(pulses.shape[1]):
        tot = 0
        # loop on the pulse range
        p_start = pulses[0,i]
        p_stop  = pulses[1,i]
        for j in range(p_start, p_stop + 1):
            tot += wfDER[j]

        pulses_ampl.append(tot)
    return np.vstack((np.array(pulses_ampl), pulses))
In [35]:
%time ps1 = find_amplitudes_explicit(wfDER, thrUP=50, thrDOWN=0)
CPU times: user 5.4 s, sys: 18.8 ms, total: 5.41 s
Wall time: 5.45 s

Numpy only

In [36]:
def find_pulses_numpy(wfDER, thrUP=30, thrDOWN=0):
    '''
    This function returns a (2,N) array with N being the number of the found pulses and:
    [0,:] = start of the pulse
    [1,:] = stop  of the pulse
    '''
    starts,  = np.where((wfDER>thrUP) & (np.roll(wfDER,1)<=thrUP) )  
    stops,  = np.where((wfDER<thrDOWN) & (np.roll(wfDER,1)>=thrDOWN) )
    
    real_starts, = np.where(np.diff(np.searchsorted(stops, starts), prepend=-1))
    real_stops, = np.where( np.diff( np.searchsorted(starts, stops), prepend=0) )
    return np.vstack((starts[real_starts], stops[real_stops]))

def find_amplitudes_numpy(wfDER, thrUP=30, thrDOWN=0):
    '''
    This function returns a (3,N) array with N being the number of the found pulses and:
    [0,:] = amplitude of the pulse
    [1,:] = start of the pulse
    [2,:] = stop of the pulse
    '''
    pulses = find_pulses_numpy(wfDER, thrUP, thrDOWN)
    ampl = np.add.reduceat(wfDER, pulses.flatten('F'))[::2]
    return np.vstack((ampl, pulses))
    
    
    
In [37]:
%time ps2 = find_amplitudes_numpy(wfDER, thrUP=50, thrDOWN=0)
CPU times: user 12.6 ms, sys: 38 µs, total: 12.6 ms
Wall time: 11.8 ms

Further checks / debug

In [38]:
print(ps1.shape, ps1.dtype, ps2.shape,  ps2.dtype)
(3, 928) int64 (3, 928) int64

Found amplitudes

In [39]:
fig, ax = plt.subplots(1,2,figsize=(12,5))
ax[0].hist(ps1[0,:], 50)
ax[1].hist(ps1[0,:], 50)
plt.show()

Found deltat

In [40]:
fig, ax = plt.subplots(1,2,figsize=(12,5))
ax[0].hist(np.diff(ps1[1,:]), 50, log=True)
ax[1].hist(np.diff(ps2[1,:]), 50, log=True)
plt.show()

Correlation between deltat and amplitude

In [41]:
fig, ax = plt.subplots(1,2,figsize=(12,5))
ax[0].hist2d(ps1[0,:], np.diff(ps1[1,:], prepend=0), 50)
ax[1].hist2d(ps1[0,:], np.diff(ps2[1,:], prepend=0), 50)
plt.show()

Further debug, start and stop comparison

In [42]:
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection

start, stop = 1000, 1100

fig, ax = plt.subplots(figsize=(14,3))
ax.plot(np.arange(start, stop),wfDER[start:stop], '-k', marker='.', lw=1)
ax.axhline(0, ls='--', color='r', lw=1)
ax.axhline(30, ls='--', color='r', lw=1)

ybot, ytop = ax.get_ylim()


# get pulses within the choosen range
rlist1=[]
for i in range(ps1.shape[1]):
    if (ps1[1,i]>start) & (ps1[1,i]<stop):
        rlist1.append( Rectangle((ps1[1,i], ybot), ps1[2,i]-ps1[1,i], (ytop-ybot)/2 ) )
pc1 = PatchCollection(rlist1, facecolor='cyan', alpha=0.1)

rlist2=[]
for i in range(ps2.shape[1]):
    if (ps2[1,i]>start) & (ps2[1,i]<stop):
        rlist2.append( Rectangle((ps2[1,i], ybot+(ytop-ybot)/2), ps2[2,i]-ps2[1,i], (ytop-ybot)/2 ) )
pc2 = PatchCollection(rlist2, facecolor='yellow', alpha=0.1)


plt.gca().add_collection(pc1)
plt.gca().add_collection(pc2)

plt.show()
In [ ]: