Performance of Intel® Streaming SIMD Extensions Intrinsics

Regular Intel® Streaming SIMD Extensions (Intel® SSE) intrinsics work on four 32-bit single-precision values. On IA-64 architecture-based systems, basic operations like add and compare require two SIMD instructions. Both instructions can be executed in the same cycle, so the throughput is one basic Intel® SSE operation per cycle, or four 32-bit single-precision operations per cycle.

Key to the table entries

Intrinsic Name

MMX(TM) Technology

Intel® SSE / Intel® SSE2

IA-64 Architecture

_mm_add_ss

N/A

B

B

_mm_add_ps

N/A

A

A

_mm_sub_ss

N/A

B

B

_mm_sub_ps

N/A

A

A

_mm_mul_ss

N/A

B

B

_mm_mul_ps

N/A

A

A

_mm_div_ss

N/A

B

B

_mm_div_ps

N/A

A

A

_mm_sqrt_ss

N/A

B

B

_mm_sqrt_ps

N/A

A

A

_mm_rcp_ss

N/A

B

B

_mm_rcp_ps

N/A

A

A

_mm_rsqrt_ss

N/A

B

B

_mm_rsqrt_ps

N/A

A

A

_mm_min_ss

N/A

B

B

_mm_min_ps

N/A

A

A

_mm_max_ss

N/A

B

B

_mm_max_ps

N/A

A

A

_mm_and_ps

N/A

A

A

_mm_andnot_ps

N/A

A

A

_mm_or_ps

N/A

A

A

_mm_xor_ps

N/A

A

A

_mm_cmpeq_ss

N/A

B

B

_mm_cmpeq_ps

N/A

A

A

_mm_cmplt_ss

N/A

B

B

_mm_cmplt_ps

N/A

A

A

_mm_cmple_ss

N/A

B

B

_mm_cmple_ps

N/A

A

A

_mm_cmpgt_ss

N/A

B

B

_mm_cmpgt_ps

N/A

A

A

_mm_cmpge_ss

N/A

B

B

_mm_cmpge_ps

N/A

A

A

_mm_cmpneq_ss

N/A

B

B

_mm_cmpneq_ps

N/A

A

A

_mm_cmpnlt_ss

N/A

B

B

_mm_cmpnlt_ps

N/A

A

A

_mm_cmpnle_ss

N/A

B

B

_mm_cmpnle_ps

N/A

A

A

_mm_cmpngt_ss

N/A

B

B

_mm_cmpngt_ps

N/A

A

A

_mm_cmpnge_ss

N/A

B

B

_mm_cmpnge_ps

N/A

A

A

_mm_cmpord_ss

N/A

B

B

_mm_cmpord_ps

N/A

A

A

_mm_cmpunord_ss

N/A

B

B

_mm_cmpunord_ps

N/A

A

A

_mm_comieq_ss

N/A

B

B

_mm_comilt_ss

N/A

B

B

_mm_comile_ss

N/A

B

B

_mm_comigt_ss

N/A

B

B

_mm_comige_ss

N/A

B

B

_mm_comineq_ss

N/A

B

B

_mm_ucomieq_ss

N/A

B

B

_mm_ucomilt_ss

N/A

B

B

_mm_ucomile_ss

N/A

B

B

_mm_ucomigt_ss

N/A

B

B

_mm_ucomige_ss

N/A

B

B

_mm_ucomineq_ss

N/A

B

B

_mm_cvtss_si32

N/A

A

B

_mm_cvtps_pi32

N/A

A

A

_mm_cvttss_si32

N/A

A

B

_mm_cvttps_pi32

N/A

A

A

_mm_cvtsi32_ss

N/A

A

B

_mm_cvtpi32_ps

N/A

A

C

_mm_cvtpi16_ps

N/A

A

C

_mm_cvtpu16_ps

N/A

A

C

_mm_cvtpi8_ps

N/A

A

C

_mm_cvtpu8_ps

N/A

A

C

_mm_cvtpi32x2_ps

N/A

A

C

_mm_cvtps_pi16

N/A

A

C

_mm_cvtps_pi8

N/A

A

C

_mm_move_ss

N/A

A

A

_mm_shuffle_ps

N/A

A

A

_mm_unpackhi_ps

N/A

A

A

_mm_unpacklo_ps

N/A

A

A

_mm_movehl_ps

N/A

A

A

_mm_movelh_ps

N/A

A

A

_mm_movemask_ps

N/A

A

C

_mm_getcsr

N/A

A

A

_mm_setcsr

N/A

A

A

_mm_loadh_pi

N/A

A

A

_mm_loadl_pi

N/A

A

A

_mm_load_ss

N/A

A

B

_mm_load1_ps

N/A

A

A

_mm_load_ps

N/A

A

A

_mm_loadu_ps

N/A

A

A

_mm_loadr_ps

N/A

A

A

_mm_storeh_pi

N/A

A

A

_mm_storel_pi

N/A

A

A

_mm_store_ss

N/A

A

A

_mm_store_ps

N/A

A

A

_mm_store1_ps

N/A

A

A

_mm_storeu_ps

N/A

A

A

_mm_storer_ps

N/A

A

A

_mm_set_ss

N/A

A

A

_mm_set1_ps

N/A

A

A

_mm_set_ps

N/A

A

A

_mm_setr_ps

N/A

A

A

_mm_setzero_ps

N/A

A

A

_mm_prefetch

N/A

A

A

_mm_stream_pi

N/A

A

A

_mm_stream_ps

N/A

A

A

_mm_sfence

N/A

A

A

_mm_extract_pi16

N/A

A

A

_mm_insert_pi16

N/A

A

A

_mm_max_pi16

N/A

A

A

_mm_max_pu8

N/A

A

A

_mm_min_pi16

N/A

A

A

_mm_min_pu8

N/A

A

A

_mm_movemask_pi8

N/A

A

C

_mm_mulhi_pu16

N/A

A

A

_mm_shuffle_pi16

N/A

A

A

_mm_maskmove_si64

N/A

A

C

_mm_avg_pu8

N/A

A

A

_mm_avg_pu16

N/A

A

A

_mm_sad_pu8

N/A

A

A