intel mkl

MKL 教程

intel mkl

安装

在 Intel 官网注册并下载 MKL:https://software.intel.com/en-us/mkl
Linux下安装:

wget http://registrationcenter-download.intel.com/akdlm/irc_nas/tec/14895/l_mkl_2019.1.144.tgz
tar -zxvf l_mkl_2019.1.144.tgz
cd l_mkl_2019.1.144/
./install.sh
sudo vim /etc/ld.so.conf.d/intel-mkl.conf
    /path/intel/mkl/lib/intel64
    /path/intel/lib/intel64
sudo ldconfig
cd /path/intel/mkl/bin
source mklvars.sh intel64  # https://software.intel.com/en-us/mkl-linux-developer-guide-scripts-to-set-environment-variables
vim dgemm_example.c  # input your code
gcc -o run_dgemm_example dgemm_example.c -lmkl_rt

实例

官方入门教程(C 语言):https://software.intel.com/en-us/mkl-tutorial-c-overview
所有实例:https://software.intel.com/en-us/product-code-samples

wget https://software.intel.com/sites/default/files/ipsxe2019_samples_lin_20180731.tgz
mkdir ipsxe2019_samples_lin_20180731
tar -zxvf ipsxe2019_samples_lin_20180731.tgz -C ipsxe2019_samples_lin_20180731


(1)源码:dgemm_example.c

#include <stdio.h>
#include <stdlib.h>

#include "mkl.h"

#define min(x,y) (((x) < (y)) ? (x) : (y))

/*
 * Fills A and B with simple ramps, multiplies them with one cblas_dgemm call
 * (C = alpha*A*B + beta*C, row-major storage), and prints the top-left corner
 * of each matrix. Returns 0 on success, 1 if any buffer cannot be allocated.
 */
int main()
{
    /* A is a_rows x shared, B is shared x b_cols, C is a_rows x b_cols. */
    const int a_rows = 2000, shared = 200, b_cols = 1000;
    const double alpha = 1.0, beta = 0.0;
    double *A, *B, *C;
    int pos, row, col;
    int row_limit, col_limit;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", a_rows, shared, shared, b_cols);

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    A = (double *)mkl_malloc( a_rows*shared*sizeof( double ), 64 );
    B = (double *)mkl_malloc( shared*b_cols*sizeof( double ), 64 );
    C = (double *)mkl_malloc( a_rows*b_cols*sizeof( double ), 64 );
    if (!A || !B || !C) {
        /* Release whichever buffers did get allocated before bailing out. */
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    /* A holds 1, 2, 3, ... in storage order. */
    for (pos = 0; pos < a_rows*shared; ++pos) {
        A[pos] = (double)(pos+1);
    }
    /* B holds -1, -2, -3, ... in storage order. */
    for (pos = 0; pos < shared*b_cols; ++pos) {
        B[pos] = (double)(-pos-1);
    }
    for (pos = 0; pos < a_rows*b_cols; ++pos) {
        C[pos] = 0.0;
    }

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    /* Leading dimensions equal the row lengths because storage is row-major. */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                a_rows, b_cols, shared,
                alpha, A, shared, B, b_cols,
                beta, C, b_cols);
    printf ("\n Computations completed.\n\n");

    /* Show at most a 6x6 corner of each matrix. */
    printf (" Top left corner of matrix A: \n");
    row_limit = min(a_rows,6);
    col_limit = min(shared,6);
    for (row = 0; row < row_limit; ++row) {
        for (col = 0; col < col_limit; ++col) {
            printf ("%12.0f", A[row*shared + col]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    row_limit = min(shared,6);
    col_limit = min(b_cols,6);
    for (row = 0; row < row_limit; ++row) {
        for (col = 0; col < col_limit; ++col) {
            printf ("%12.0f", B[row*b_cols + col]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    row_limit = min(a_rows,6);
    col_limit = min(b_cols,6);
    for (row = 0; row < row_limit; ++row) {
        for (col = 0; col < col_limit; ++col) {
            printf ("%12.5G", C[row*b_cols + col]);
        }
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}

运行结果如下:

$ ./run_dgemm_example

 This example computes real matrix C=alpha*A*B+beta*C using
 Intel(R) MKL function dgemm, where A, B, and  C are matrices and
 alpha and beta are double precision scalars

 Initializing data for matrix multiplication C=A*B for matrix
 A(2000x200) and matrix B(200x1000)

 Allocating memory for matrices aligned on 64-byte boundary for better
 performance

 Intializing matrix data

 Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface


 Computations completed.

 Top left corner of matrix A:
           1           2           3           4           5           6
         201         202         203         204         205         206
         401         402         403         404         405         406
         601         602         603         604         605         606
         801         802         803         804         805         806
        1001        1002        1003        1004        1005        1006

 Top left corner of matrix B:
          -1          -2          -3          -4          -5          -6
       -1001       -1002       -1003       -1004       -1005       -1006
       -2001       -2002       -2003       -2004       -2005       -2006
       -3001       -3002       -3003       -3004       -3005       -3006
       -4001       -4002       -4003       -4004       -4005       -4006
       -5001       -5002       -5003       -5004       -5005       -5006

 Top left corner of matrix C:
 -2.6666E+09 -2.6666E+09 -2.6667E+09 -2.6667E+09 -2.6667E+09 -2.6667E+09
 -6.6467E+09 -6.6467E+09 -6.6468E+09 -6.6468E+09 -6.6469E+09  -6.647E+09
 -1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10 -1.0627E+10
 -1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10 -1.4607E+10
 -1.8587E+10 -1.8587E+10 -1.8587E+10 -1.8587E+10 -1.8588E+10 -1.8588E+10
 -2.2567E+10 -2.2567E+10 -2.2567E+10 -2.2567E+10 -2.2568E+10 -2.2568E+10

 Deallocating memory

 Example completed.


(2)源码:dgemm_with_timing.c

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

/*
 * Times cblas_dgemm on a 2000x200 by 200x1000 product: one untimed warm-up
 * call, then LOOP_COUNT timed calls whose average is reported in
 * milliseconds. If the total run was well under a second, a larger
 * LOOP_COUNT is suggested. Returns 0 on success, 1 on allocation failure.
 */
int main()
{
    double *A, *B, *C;
    int m, n, p, i, r;
    double alpha, beta;
    double s_initial, s_elapsed;

    printf ("\n This example measures performance of Intel(R) MKL function dgemm \n"
            " computing real matrix C=alpha*A*B+beta*C, where A, B, and C \n"
            " are matrices and alpha and beta are double precision scalars\n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    /* Do the size arithmetic in size_t so larger dimensions cannot overflow int. */
    A = (double *)mkl_malloc( (size_t)m * p * sizeof( double ), 64 );
    B = (double *)mkl_malloc( (size_t)p * n * sizeof( double ), 64 );
    C = (double *)mkl_malloc( (size_t)m * n * sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    /* Untimed warm-up call so one-time start-up costs stay out of the measurement. */
    printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"
            " via CBLAS interface to get stable run time measurements \n\n");
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, p, alpha, A, p, B, n, beta, C, n);

    printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"
            " via CBLAS interface \n\n");
    s_initial = dsecnd();
    for (r = 0; r < LOOP_COUNT; r++) {
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);
    }
    /* Average wall time of one call, in seconds. */
    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

    printf (" == Matrix multiplication using Intel(R) MKL dgemm completed == \n"
            " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    /*
     * Suggest a LOOP_COUNT that gives roughly one second of total run time.
     * The s_elapsed > 0.0 guard skips the suggestion when the timer reported
     * no elapsed time; dividing by zero would yield infinity, and converting
     * that to int is undefined.
     */
    if (s_elapsed > 0.0 && s_elapsed < 0.9/LOOP_COUNT) {
        double scale = 1.0/LOOP_COUNT/s_elapsed;
        int suggested = (int)(scale*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", suggested);
    }

    printf (" Example completed. \n\n");
    return 0;
}

运行结果如下:

$ ./run_dgemm_with_timing

 This example measures performance of Intel(R) MKL function dgemm
 computing real matrix C=alpha*A*B+beta*C, where A, B, and C
 are matrices and alpha and beta are double precision scalars

 Initializing data for matrix multiplication C=A*B for matrix
 A(2000x200) and matrix B(200x1000)

 Allocating memory for matrices aligned on 64-byte boundary for better
 performance

 Intializing matrix data

 Making the first run of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface to get stable run time measurements

 Measuring performance of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface

 == Matrix multiplication using Intel(R) MKL dgemm completed ==
 == at 4.53907 milliseconds ==

 Deallocating memory

 It is highly recommended to define LOOP_COUNT for this example on your
 computer as 221 to have total execution time about 1 second for reliability
 of measurements

 Example completed.


(3)源码:dgemm_threading_effect_example.c

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

/*
 * Measures how cblas_dgemm run time changes with the MKL thread count: for
 * each count from 1 to mkl_get_max_threads() it does one untimed warm-up
 * call followed by LOOP_COUNT timed calls and prints the average in
 * milliseconds. Returns 0 on success, 1 on allocation failure.
 */
int main()
{
    double *A, *B, *C;
    int m, n, p, i, j, r, max_threads;
    double alpha, beta;
    double s_initial;
    /* Start at 0 so the check after the loop is defined even if the loop
       body never runs (max_threads < 1). */
    double s_elapsed = 0.0;

    printf ("\n This example demonstrates threading impact on computing real matrix product \n"
            " C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    /* Do the size arithmetic in size_t so larger dimensions cannot overflow int. */
    A = (double *)mkl_malloc( (size_t)m * p * sizeof( double ), 64 );
    B = (double *)mkl_malloc( (size_t)p * n * sizeof( double ), 64 );
    C = (double *)mkl_malloc( (size_t)m * n * sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Finding max number of threads Intel(R) MKL can use for parallel runs \n\n");
    max_threads = mkl_get_max_threads();

    printf (" Running Intel(R) MKL from 1 to %i threads \n\n", max_threads);
    for (i = 1; i <= max_threads; i++) {
        /* Clear C before each thread-count run. */
        for (j = 0; j < (m*n); j++) {
            C[j] = 0.0;
        }

        printf (" Requesting Intel(R) MKL to use %i thread(s) \n\n", i);
        mkl_set_num_threads(i);

        /* Untimed warm-up call at this thread count. */
        printf (" Making the first run of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface to get stable run time measurements \n\n");
        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    m, n, p, alpha, A, p, B, n, beta, C, n);

        printf (" Measuring performance of matrix product using Intel(R) MKL dgemm function \n"
                " via CBLAS interface on %i thread(s) \n\n", i);
        s_initial = dsecnd();
        for (r = 0; r < LOOP_COUNT; r++) {
            cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        m, n, p, alpha, A, p, B, n, beta, C, n);
        }
        /* Average wall time of one call, in seconds. */
        s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

        printf (" == Matrix multiplication using Intel(R) MKL dgemm completed ==\n"
                " == at %.5f milliseconds using %d thread(s) ==\n\n", (s_elapsed * 1000), i);
    }

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    /*
     * s_elapsed now holds the timing of the last thread count that ran.
     * Suggest a LOOP_COUNT giving roughly one second of run time, but only
     * for a positive measurement: a zero value means either nothing ran or
     * the timer saw no elapsed time, and dividing by it would be invalid.
     */
    if (s_elapsed > 0.0 && s_elapsed < 0.9/LOOP_COUNT) {
        double scale = 1.0/LOOP_COUNT/s_elapsed;
        int suggested = (int)(scale*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", suggested);
    }

    printf (" Example completed. \n\n");
    return 0;
}

运行结果如下:

$ ./run_dgemm_threading_effect_example

 This example demonstrates threading impact on computing real matrix product
 C=alpha*A*B+beta*C using Intel(R) MKL function dgemm, where A, B, and C are
 matrices and alpha and beta are double precision scalars

 Initializing data for matrix multiplication C=A*B for matrix
 A(2000x200) and matrix B(200x1000)

 Allocating memory for matrices aligned on 64-byte boundary for better
 performance

 Intializing matrix data

 Finding max number of threads Intel(R) MKL can use for parallel runs

 Running Intel(R) MKL from 1 to 4 threads

 Requesting Intel(R) MKL to use 1 thread(s)

 Making the first run of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface to get stable run time measurements

 Measuring performance of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface on 1 thread(s)

 == Matrix multiplication using Intel(R) MKL dgemm completed ==
 == at 15.35492 milliseconds using 1 thread(s) ==

 Requesting Intel(R) MKL to use 2 thread(s)

 Making the first run of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface to get stable run time measurements

 Measuring performance of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface on 2 thread(s)

 == Matrix multiplication using Intel(R) MKL dgemm completed ==
 == at 7.89382 milliseconds using 2 thread(s) ==

 Requesting Intel(R) MKL to use 3 thread(s)

 Making the first run of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface to get stable run time measurements

 Measuring performance of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface on 3 thread(s)

 == Matrix multiplication using Intel(R) MKL dgemm completed ==
 == at 5.45093 milliseconds using 3 thread(s) ==

 Requesting Intel(R) MKL to use 4 thread(s)

 Making the first run of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface to get stable run time measurements

 Measuring performance of matrix product using Intel(R) MKL dgemm function
 via CBLAS interface on 4 thread(s)

 == Matrix multiplication using Intel(R) MKL dgemm completed ==
 == at 4.42070 milliseconds using 4 thread(s) ==

 Deallocating memory

 It is highly recommended to define LOOP_COUNT for this example on your
 computer as 227 to have total execution time about 1 second for reliability
 of measurements

 Example completed.


(4)源码:matrix_multiplication.c

#define min(x,y) (((x) < (y)) ? (x) : (y))

#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Consider adjusting LOOP_COUNT based on the performance of your computer */
/* to make sure that total run time is at least 1 second */
#define LOOP_COUNT 10

/*
 * Computes C = alpha*A*B + beta*C with a plain triple nested loop.
 * A is m x p, B is p x n and C is m x n, each stored contiguously in
 * row-major order. When beta is exactly 0 the previous contents of C are
 * overwritten without being read.
 */
static void naive_dgemm(int m, int n, int p, double alpha, const double *A,
                        const double *B, double beta, double *C)
{
    int i, j, k;

    for (i = 0; i < m; i++) {
        for (j = 0; j < n; j++) {
            double sum = 0.0;
            for (k = 0; k < p; k++) {
                sum += A[p*i+k] * B[n*k+j];
            }
            if (beta == 0.0) {
                C[n*i+j] = alpha*sum;
            } else {
                C[n*i+j] = alpha*sum + beta*C[n*i+j];
            }
        }
    }
}

/*
 * Times the naive triple-loop matrix product on a 2000x200 by 200x1000
 * problem: one untimed warm-up pass, then LOOP_COUNT timed passes whose
 * average is reported in milliseconds. If the total run was well under a
 * second, a larger LOOP_COUNT is suggested. Returns 0 on success, 1 on
 * allocation failure.
 */
int main()
{
    double *A, *B, *C;
    int m, n, p, i, r;
    double alpha, beta;
    double s_initial, s_elapsed;

    printf ("\n This example measures performance of rcomputing the real matrix product \n"
            " C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are \n"
            " matrices and alpha and beta are double precision scalars \n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    /* Do the size arithmetic in size_t so larger dimensions cannot overflow int. */
    A = (double *)mkl_malloc( (size_t)m * p * sizeof( double ), 64 );
    B = (double *)mkl_malloc( (size_t)p * n * sizeof( double ), 64 );
    C = (double *)mkl_malloc( (size_t)m * n * sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }

    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }

    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    /* Untimed warm-up pass. */
    printf (" Making the first run of matrix product using triple nested loop\n"
            " to get stable run time measurements \n\n");
    naive_dgemm(m, n, p, alpha, A, B, beta, C);

    printf (" Measuring performance of matrix product using triple nested loop \n\n");
    s_initial = dsecnd();
    for (r = 0; r < LOOP_COUNT; r++) {
        naive_dgemm(m, n, p, alpha, A, B, beta, C);
    }
    /* Average wall time of one pass, in seconds. */
    s_elapsed = (dsecnd() - s_initial) / LOOP_COUNT;

    printf (" == Matrix multiplication using triple nested loop completed == \n"
            " == at %.5f milliseconds == \n\n", (s_elapsed * 1000));

    printf (" Deallocating memory \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    /*
     * Suggest a LOOP_COUNT that gives roughly one second of total run time.
     * The s_elapsed > 0.0 guard skips the suggestion when the timer reported
     * no elapsed time; dividing by zero would yield infinity, and converting
     * that to int is undefined.
     */
    if (s_elapsed > 0.0 && s_elapsed < 0.9/LOOP_COUNT) {
        double scale = 1.0/LOOP_COUNT/s_elapsed;
        int suggested = (int)(scale*LOOP_COUNT)+1;
        printf(" It is highly recommended to define LOOP_COUNT for this example on your \n"
               " computer as %i to have total execution time about 1 second for reliability \n"
               " of measurements\n\n", suggested);
    }

    printf (" Example completed. \n\n");
    return 0;
}

运行结果如下:

$ ./run_matrix_multiplication

 This example measures performance of rcomputing the real matrix product
 C=alpha*A*B+beta*C using a triple nested loop, where A, B, and C are
 matrices and alpha and beta are double precision scalars

 Initializing data for matrix multiplication C=A*B for matrix
 A(2000x200) and matrix B(200x1000)

 Allocating memory for matrices aligned on 64-byte boundary for better
 performance

 Intializing matrix data

 Making the first run of matrix product using triple nested loop
 to get stable run time measurements

 Measuring performance of matrix product using triple nested loop

 == Matrix multiplication using triple nested loop completed ==
 == at 1408.21425 milliseconds ==

 Deallocating memory

 Example completed.