Interesting. I think you can vectorize the prologue using movemask + popcnt instead of keeping a counter in the ymm registers (warning: untested code, still need to benchmark it):
// Vectorized prologue: handle the first (possibly unaligned) part of the
// string with one aligned 32-byte load, then let the main loop continue on
// aligned blocks. Bytes equal to 's' are counted in m_plus and bytes equal
// to 'p' in m_minus; the final expression adds the former and subtracts the
// latter. `input` and `res` come from the surrounding (elided) code.
const __m256i zero = _mm256_setzero_si256();
const __m256i s = _mm256_set1_epi8( 's' );
const __m256i p = _mm256_set1_epi8( 'p' );
const size_t a = (size_t)input;
// Number of bytes between the enclosing 32-byte boundary and `input`.
const size_t rem = a % 32;
const char* aligned = input - rem;
// Load from the rounded-down address: an aligned load from `input` itself
// would fault whenever rem != 0. The block also contains `rem` bytes that
// precede the string; they are discarded below by shifting the masks.
// NOTE(review): relies on an aligned 32-byte read never crossing a page
// boundary — confirm this over-read is acceptable for the target/toolchain.
const __m256i v = _mm256_load_si256(( const __m256i*) aligned);
const __m256i z = _mm256_cmpeq_epi8( v, zero );
// _mm256_movemask_epi8 returns int; go through unsigned so that a set bit 31
// is not sign-extended into the upper half of the 64-bit mask.
size_t m_plus = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, s));
size_t m_minus = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, p));
// Drop the bits that belong to bytes located before `input`, so a stray zero
// byte in front of the string is not mistaken for the terminator.
size_t m_zero = ((unsigned)_mm256_movemask_epi8(z)) >> rem;
// Index of the terminator relative to `input`; 64 when no terminator is in
// this block, in which case _bzhi_u64 leaves the mask untouched.
size_t offset_zero = _tzcnt_u64(m_zero);
// Keep only matches located before the terminator and turn masks into counts.
m_plus = (size_t)_mm_popcnt_u64(_bzhi_u64(m_plus >> rem, offset_zero));
m_minus = (size_t)_mm_popcnt_u64(_bzhi_u64(m_minus >> rem, offset_zero));
// Skip the loop if we already found the end of the string...
// NOTE(review): the elided loop body presumably refreshes m_zero from the
// next aligned block (no shift needed there) — confirm.
while (m_zero == 0) {
// ...
}
// ...
return m_plus + res - m_minus;