Interesting. I think you can vectorize the prologue using movemask + popcnt inst...

Interesting. I think you can vectorize the prologue using movemask + popcnt instead of keeping a counter in the ymm registers (warning: untested code, still need to benchmark it):

    // Vectorized prologue: handle the first (possibly unaligned) bytes of
    // `input` with one aligned 32-byte load, then turn the per-byte compare
    // results into bitmasks (movemask) and count the matches with popcnt.
    const __m256i zero = _mm256_setzero_si256();
    const __m256i s = _mm256_set1_epi8( 's' );
    const __m256i p = _mm256_set1_epi8( 'p' );

    // Round the pointer down to a 32-byte boundary; `rem` is the number of
    // bytes in the aligned block that precede the start of the string.
    // NOTE(review): this reads up to 31 bytes before `input`. An aligned
    // 32-byte load cannot cross a page boundary, but tools such as ASan may
    // still flag it.
    const size_t a = (size_t)input;
    const size_t rem = a % 32;
    const char* aligned = input - rem;

    // Must load from `aligned`: _mm256_load_si256 requires 32-byte alignment.
    const __m256i v = _mm256_load_si256(( const __m256i*) aligned);
    const __m256i z = _mm256_cmpeq_epi8( v, zero );

    // _mm256_movemask_epi8 returns int; convert through unsigned so a set
    // bit 31 is not sign-extended into the upper half of size_t.
    size_t m_plus = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, s));
    size_t m_minus = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, p));

    // `z` already holds the compare-with-zero result. Shift out the `rem`
    // bytes that precede `input` so NUL bytes before the string are ignored.
    size_t m_zero = ((unsigned)_mm256_movemask_epi8(z)) >> rem;

    // Index of the first NUL at or after `input`; tzcnt yields 64 when there
    // is none, and bzhi with an index >= 64 leaves its operand unchanged.
    size_t offset_zero = _mm_tzcnt_64(m_zero);

    // Keep only matches located before the terminator and count them, so
    // m_plus / m_minus are match counts (not bitmasks) from here on.
    m_plus = (size_t)_mm_popcnt_u64(_bzhi_u64(m_plus >> rem, offset_zero));
    m_minus = (size_t)_mm_popcnt_u64(_bzhi_u64(m_minus >> rem, offset_zero));

    // Skip the loop if we already found the end of the string...
    while (m_zero == 0) {
        // ...
    }
    
    // ...
    
    return m_plus + res - m_minus;