All of these tricks are intended to save a factor of at least two (and often ten or more) through local changes to code. Of course, algorithmic improvements can provide much greater efficiency gains, so I am assuming that you have done all you can in that respect BEFORE using these tricks.
/*
 * Return the parity of an 8-bit value: 1 if an odd number of bits are
 * set in x, 0 otherwise.  Assumes unsigned8 is an 8-bit unsigned type
 * (its definition is not visible here), i.e. x <= 0xFF.
 *
 * x * 0x10101 lays down three copies of x at bit offsets 0, 8 and 16.
 * Masking with octal 011111111 (every third bit, bits 0..21) then picks
 * each of the eight bits of x exactly once, spaced three bits apart.
 * Multiplying by the same constant adds all eight selected bits together
 * at bit 21 (cf. long multiplication).  The columns below bit 21 sum at
 * most seven 1s each, which fits in three bits, so nothing carries into
 * bit 21: it is the low bit of the bit count, i.e. the parity.
 *
 * The UL suffixes force the arithmetic to be unsigned.  The second
 * product is roughly 2^42; with a promoted (signed) int operand that
 * would be signed overflow, which is undefined behaviour.  Unsigned
 * wrap-around only discards high bits and leaves bit 21 intact.
 */
int parity(unsigned8 x) {
  return (int)((((x * 0x10101UL) & 011111111UL) * 011111111UL) >> 21 & 1);
}
/*
 * Return 1 if any of the four bytes of x is 0x00, otherwise 0.
 *
 * Subtracting 1 from every byte turns on the top bit of a byte that was
 * zero (through the borrow); and-ing with ~x discards bytes whose top bit
 * was already set before the subtraction.  Any 0x80 bit that survives
 * therefore marks the presence of a zero byte.
 */
int haszerobyte(unsigned32 x) {
  const unsigned32 minus_one_each = x - 0x01010101;
  const unsigned32 top_was_clear = ~x;

  if (minus_one_each & top_was_clear & 0x80808080)
    return 1;
  return 0;
}
/*
 * 64-bit variant: return nonzero (1) if any of the eight bytes of x is
 * 0x00, otherwise 0.
 *
 * Subtracting 0x01 from every byte turns on the top bit of a byte that
 * was zero (through the borrow); and-ing with ~x discards bytes whose top
 * bit was already set, and the 0x80.. mask keeps only the per-byte top
 * bits.  The subtrahend must have 0x01 in all eight bytes (16 hex
 * digits); a 15-digit constant is 0x0010101010101010 and reports false
 * positives, e.g. for x = 0x0101010101010101.
 *
 * NOTE(review): this shares its name with the unsigned32 version; plain C
 * has no overloading, so the two cannot live in one translation unit as
 * written.
 */
int haszerobyte(unsigned64 x) {
  return ((x - 0x0101010101010101) & (~x) & 0x8080808080808080) != 0;
}
/*
 * Count the 1 bits of x, which must be a 9-bit quantity (0..0777 octal).
 * Assumes unsigned36 can hold at least 36 bits (its definition is not
 * visible here).
 *
 * x * 01001001001 (octal) makes four copies of x at bit offsets 0, 9, 18
 * and 27.  The mask 0x111111111 keeps every fourth bit, i.e. the low bit
 * of each of nine hexadecimal digits, and those nine positions pick up
 * each bit of x exactly once.  Reducing modulo 15 "casts out fifteens" in
 * base 16, which yields the sum of the hex digits; that sum is at most 9,
 * so it is the bit count itself.
 */
int count_ones(unsigned36 x) {
  return ((x * 01001001001) & 0x111111111) % 15;
}
;if A is a 9 bit quantity, B gets number of 1's (Schroeppel)
        ;Bit count of a 9 bit quantity in A.  Literals are octal: 17 below is
        ;the "15." (decimal) of its comment.  Same constants as the C
        ;count_ones() above (1001001001 octal, 42104210421 octal = 0x111111111).
        ;NOTE(review): the result is described as landing in B -- presumably
        ;the remainder register of IDIVI; confirm against the machine manual.
        IMUL A,[1001001001]     ;4 copies of A, 9 bits apart
        AND A,[42104210421]     ;every 4th bit, i.e. the low bit of each hexadecimal digit
        IDIVI A,17              ;casting out 15.'s in hexadecimal (remainder = sum of the hex digits)
;if A is a 6 bit quantity, B gets the 6 bits reversed (Schroeppel)
;Literals are octal: 377 is 2^8 - 1.
;NOTE(review): B is presumably the remainder register of IDIVI; confirm.
        IMUL A,[2020202]        ;4 copies shifted (6 bits apart, each moved up one bit)
        AND A,[104422010]       ;where bits coincide with reverse repeated base 2^8
        IDIVI A,377             ;casting out 2^8 - 1's (folds the kept bits into one base 2^8 digit)
;reverse 7 bits (Schroeppel)
;Literals are octal: 377 is 2^8 - 1.
;NOTE(review): by analogy with the 6 bit version the input is in A and the
;result in B -- confirm.
        IMUL A,[10004002001]    ;4 copies sep by 000's base 2 (may set arith. o'flow)
        AND A,[210210210010]    ;where bits coincide with reverse repeated base 2^8
        IDIVI A,377             ;casting out 377's (377 octal = 2^8 - 1)
;reverse 8 bits (Schroeppel)
;Literals are octal: 1777 is 2^10 - 1.  Unlike the shorter versions this one
;uses MUL and masks both A and B before dividing.
;NOTE(review): presumably MUL/DIVI operate on the double-length A,B pair and
;the reversed byte ends up in B -- confirm against the machine manual.
        MUL A,[100200401002]    ;5 copies in A and B
        AND B,[20420420020]     ;where bits coincide with reverse repeated base 2^10
        ANDI A,41               ;" (ditto: same selection, applied to the part held in A)
        DIVI A,1777             ;casting out 2^10 - 1's