/*	$NetBSD: epoch.c,v 1.2 2021/08/14 16:14:58 christos Exp $	*/

/* epoch.c - epoch based memory reclamation */
/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 2018-2021 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */

/** @file epoch.c
 *
 * Implementation of epoch based memory reclamation, in principle
 * similar to the algorithm presented in
 * https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.pdf
 *
 * Not completely lock-free at the moment.
 *
 * The usual problems with epoch based memory reclamation are also still
 * present - a thread that gets stuck while actively observing an epoch
 * will prevent managed objects (in our case connections and operations)
 * from being freed, potentially running the process out of memory.
 */
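/*
 * A minimal sketch of the intended usage, with hypothetical object and
 * callback names (conn, handle and connection_destroy are placeholders,
 * not part of this API):
 *
 *     epoch_t epoch = epoch_join();
 *     handle( conn );
 *     epoch_append( conn, (dispose_cb *)connection_destroy );
 *     epoch_leave( epoch );
 *
 * Objects handed to epoch_append() are only disposed of once no thread can
 * still be observing an epoch in which they were reachable.
 */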

#include <sys/cdefs.h>
__RCSID("$NetBSD: epoch.c,v 1.2 2021/08/14 16:14:58 christos Exp $");

#include "portable.h"

#include "lload.h"
#include <epoch.h>

/* Has to be >= 3 */
#define EPOCH_MASK ( 1 << 2 )
#define EPOCH_PREV(epoch) ( ( (epoch) + EPOCH_MASK - 1 ) % EPOCH_MASK )
#define EPOCH_NEXT(epoch) ( ( (epoch) + 1 ) % EPOCH_MASK )
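
/*
 * With EPOCH_MASK == 4 the epoch counter cycles 0 -> 1 -> 2 -> 3 -> 0;
 * for example, EPOCH_PREV(0) == 3 and EPOCH_NEXT(3) == 0.
 */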

struct pending_ref {
    void *object;
    dispose_cb *dispose;
    struct pending_ref *next;
};

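/*
 * epoch_mutex serialises epoch advancement against threads joining an epoch.
 * current_epoch is the epoch that new joiners observe, epoch_threads[e]
 * counts the threads currently observing epoch e, and references[e] holds
 * the objects retired during epoch e, waiting to be disposed of once that
 * is safe.
 */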
ldap_pvt_thread_rdwr_t epoch_mutex;

static epoch_t current_epoch;
static uintptr_t epoch_threads[EPOCH_MASK];
static struct pending_ref *references[EPOCH_MASK];

void
epoch_init( void )
{
    epoch_t epoch;

    current_epoch = 0;
    for ( epoch = 0; epoch < EPOCH_MASK; epoch++ ) {
        assert( !epoch_threads[epoch] );
        assert( !references[epoch] );
    }

    ldap_pvt_thread_rdwr_init( &epoch_mutex );
}

void
epoch_shutdown( void )
{
    epoch_t epoch;
    struct pending_ref *old, *next;

    for ( epoch = 0; epoch < EPOCH_MASK; epoch++ ) {
        assert( !epoch_threads[epoch] );
    }

    /*
     * Even with the work in epoch_leave(), shutdown code doesn't currently
     * observe any epoch, so there might still be references left to free.
     */
    epoch = EPOCH_PREV(current_epoch);
    next = references[epoch];
    references[epoch] = NULL;
    for ( old = next; old; old = next ) {
        next = old->next;

        old->dispose( old->object );
        ch_free( old );
    }

    epoch = current_epoch;
    next = references[epoch];
    references[epoch] = NULL;
    for ( old = next; old; old = next ) {
        next = old->next;

        old->dispose( old->object );
        ch_free( old );
    }

    /* No references should exist anywhere now */
    for ( epoch = 0; epoch < EPOCH_MASK; epoch++ ) {
        assert( !references[epoch] );
    }

    ldap_pvt_thread_rdwr_destroy( &epoch_mutex );
}

epoch_t
epoch_join( void )
{
    epoch_t epoch;
    struct pending_ref *old, *ref = NULL;

retry:
    /* TODO: make this completely lock-free */
    ldap_pvt_thread_rdwr_rlock( &epoch_mutex );
    epoch = current_epoch;
    __atomic_add_fetch( &epoch_threads[epoch], 1, __ATOMIC_ACQ_REL );
    ldap_pvt_thread_rdwr_runlock( &epoch_mutex );

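    /*
     * If any thread still observes the previous epoch, it is not yet safe
     * to reclaim that epoch's references or advance, so just report the
     * epoch we joined.
     */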
    if ( __atomic_load_n(
                 &epoch_threads[EPOCH_PREV(epoch)], __ATOMIC_ACQUIRE ) ) {
        return epoch;
    }

    __atomic_exchange(
            &references[EPOCH_PREV(epoch)], &ref, &ref, __ATOMIC_ACQ_REL );

    Debug( LDAP_DEBUG_TRACE, "epoch_join: "
            "advancing epoch to %zu with %s objects to free\n",
            EPOCH_NEXT(epoch), ref ? "some" : "no" );

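    /*
     * Nobody observes the previous epoch and its references have been
     * claimed above, so advance the global epoch. Taking the write lock
     * excludes joiners that are between reading current_epoch and
     * registering themselves in epoch_threads.
     */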
    ldap_pvt_thread_rdwr_wlock( &epoch_mutex );
    current_epoch = EPOCH_NEXT(epoch);
    ldap_pvt_thread_rdwr_wunlock( &epoch_mutex );

    if ( !ref ) {
        return epoch;
    }

    /*
     * The references claimed above are now safe to free outside of any epoch
     * and we don't want to make the current epoch last any longer than
     * necessary.
     *
     * Looks like there might be fairness issues in massively parallel
     * environments but they haven't been observed on 32-core machines.
     */
    epoch_leave( epoch );

    for ( old = ref; old; old = ref ) {
        ref = old->next;

        old->dispose( old->object );
        ch_free( old );
    }

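    /*
     * We have already left our epoch to dispose of the claimed references,
     * so rejoin - the caller must be registered in a live epoch when we
     * return.
     */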
    goto retry;
}

void
epoch_leave( epoch_t epoch )
{
    struct pending_ref *p, *next, *old_refs = NULL, *current_refs = NULL;

    /* Are there other threads observing our epoch? */
    if ( __atomic_sub_fetch( &epoch_threads[epoch], 1, __ATOMIC_ACQ_REL ) ) {
        return;
    }

    /*
     * Optimisation for the case when we're mostly idle. Otherwise we won't
     * release resources until another thread comes by and joins the epoch
     * (twice), and there is no telling how soon (or late) that is going to
     * happen.
     *
     * NB. There is no limit to the number of threads executing the following
     * code in parallel.
     */
    ldap_pvt_thread_rdwr_rlock( &epoch_mutex );
    /*
     * Anything could happen between the subtract and the lock being acquired
     * above, so check again. But once we hold this lock (and confirm no more
     * threads still observe either prospective epoch), no one will be able to
     * finish epoch_join until we've released epoch_mutex, since it holds that:
     *
     * epoch_threads[EPOCH_PREV(current_epoch)] == 0
     *
     * and that leads epoch_join() to acquire a write lock on &epoch_mutex.
     */
    if ( __atomic_load_n( &epoch_threads[epoch], __ATOMIC_RELAXED ) ) {
        /* Epoch counter has run full circle */
        ldap_pvt_thread_rdwr_runlock( &epoch_mutex );
        return;
    } else if ( epoch == current_epoch ) {
        if ( __atomic_load_n(
                     &epoch_threads[EPOCH_PREV(epoch)], __ATOMIC_RELAXED ) ) {
            /* There is another (older) thread still running */
            ldap_pvt_thread_rdwr_runlock( &epoch_mutex );
            return;
        }

        /* We're all alone, it's safe to claim all references and free them. */
        __atomic_exchange( &references[EPOCH_PREV(epoch)], &old_refs,
                &old_refs, __ATOMIC_ACQ_REL );
        __atomic_exchange( &references[epoch], &current_refs, &current_refs,
                __ATOMIC_ACQ_REL );
    } else if ( epoch == EPOCH_PREV(current_epoch) ) {
        if ( __atomic_load_n(
                     &epoch_threads[EPOCH_NEXT(epoch)], __ATOMIC_RELAXED ) ) {
            /* There is another (newer) thread still running */
            ldap_pvt_thread_rdwr_runlock( &epoch_mutex );
            return;
        }

        /* We're all alone, it's safe to claim all references and free them. */
        __atomic_exchange(
                &references[epoch], &old_refs, &old_refs, __ATOMIC_ACQ_REL );
        __atomic_exchange( &references[EPOCH_NEXT(epoch)], &current_refs,
                &current_refs, __ATOMIC_ACQ_REL );
    }
    /*
     * Else the current_epoch has moved far enough that no references remain to
     * be freed.
     */
    ldap_pvt_thread_rdwr_runlock( &epoch_mutex );

    /*
     * Trigger a memory-independent read fence to make sure we're reading the
     * state after all threads actually finished - which might have happened
     * after we acquired epoch_mutex, so ldap_pvt_thread_rdwr_rlock would not
     * catch everything.
     *
     * TODO: confirm the following:
     * It might be that the tests and exchanges above only enforce a fence for
     * the locations affected, so we could still read stale memory for
     * unrelated locations? At least that's the only explanation I've been able
     * to establish for repeated crashes that seem to have gone away with this
     * in place.
     *
     * But then that's contrary to the second example in the Acquire/Release
     * section here:
     * https://gcc.gnu.org/wiki/Atomic/GCCMM/AtomicSync
     */
    __atomic_thread_fence( __ATOMIC_ACQUIRE );

    for ( p = old_refs; p; p = next ) {
        next = p->next;

        p->dispose( p->object );
        ch_free( p );
    }

    for ( p = current_refs; p; p = next ) {
        next = p->next;

        p->dispose( p->object );
        ch_free( p );
    }
}

/*
 * Add the object to the "current global epoch", not the epoch our thread
 * entered.
 */
void
epoch_append( void *ptr, dispose_cb *cb )
{
    struct pending_ref *new;
    epoch_t epoch = __atomic_load_n( &current_epoch, __ATOMIC_ACQUIRE );

    /*
     * Note that the following assertion would not be appropriate here:
     * assert( __atomic_load_n( &epoch_threads[epoch], __ATOMIC_RELAXED ) );
     *
     * We might be a thread lagging behind in the "previous epoch" with no
     * other threads executing at all.
     */

    new = ch_malloc( sizeof(struct pending_ref) );
    new->object = ptr;
    new->dispose = cb;
    new->next = __atomic_load_n( &references[epoch], __ATOMIC_ACQUIRE );

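    /*
     * Lock-free push onto the list head: on failure,
     * __atomic_compare_exchange refreshes new->next with the head it
     * observed in references[epoch], so an empty loop body suffices.
     */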
    while ( !__atomic_compare_exchange( &references[epoch], &new->next, &new, 0,
            __ATOMIC_RELEASE, __ATOMIC_RELAXED ) )
        /* iterate until we succeed */;
}

int
acquire_ref( uintptr_t *refp )
{
    uintptr_t refcnt, new_refcnt;

    refcnt = __atomic_load_n( refp, __ATOMIC_ACQUIRE );

    /*
     * If we just incremented the refcnt and checked for zero after, another
     * thread might falsely believe the object was going to stick around.
     *
     * Checking whether the object is still dead at disposal time might not be
     * able to distinguish it from being freed in a later epoch.
     */
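    /*
     * On CAS failure, __atomic_compare_exchange leaves the freshly observed
     * value in refcnt, so the zero check below is re-evaluated against
     * up-to-date state on every iteration.
     */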
    do {
        if ( !refcnt ) {
            return refcnt;
        }

        new_refcnt = refcnt + 1;
    } while ( !__atomic_compare_exchange( refp, &refcnt, &new_refcnt, 0,
            __ATOMIC_RELEASE, __ATOMIC_RELAXED ) );
    assert( new_refcnt == refcnt + 1 );

    return refcnt;
}

int
try_release_ref( uintptr_t *refp, void *object, dispose_cb *cb )
{
    uintptr_t refcnt, new_refcnt;

    refcnt = __atomic_load_n( refp, __ATOMIC_ACQUIRE );

    /* We promise the caller that we won't decrease refcnt below 0 */
    do {
        if ( !refcnt ) {
            return refcnt;
        }

        new_refcnt = refcnt - 1;
    } while ( !__atomic_compare_exchange( refp, &refcnt, &new_refcnt, 0,
            __ATOMIC_RELEASE, __ATOMIC_RELAXED ) );
    assert( new_refcnt == refcnt - 1 );

    if ( !new_refcnt ) {
        epoch_append( object, cb );
    }

    return refcnt;
}
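
/*
 * A minimal sketch of how the reference counts are meant to pair with the
 * epoch machinery; obj, obj->refcnt and obj_destroy are placeholders, not
 * part of this API:
 *
 *     epoch_t epoch = epoch_join();
 *     if ( acquire_ref( &obj->refcnt ) ) {
 *         ... use obj ...
 *         try_release_ref( &obj->refcnt, obj, obj_destroy );
 *     }
 *     epoch_leave( epoch );
 *
 * acquire_ref() refuses to revive an object whose count has already dropped
 * to zero, while try_release_ref() retires the object through epoch_append()
 * when it releases the last reference.
 */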