- 追加された行はこのように表示されます。
- 削除された行は
このように表示されます。
!VitisとHBM(続)
複数HBMリージョンにアクセスしてみる.ポートごとに別々のbundle(gmem0〜gmem26)を割り当てる点に気をつけて,
27個のHBMリージョンにアクセス.
ちなみに,もう一組増やして30個利用しようとしたらP&Rで配線できないというエラーがでた.
extern "C" {

/**
 * Element-wise integer addition over nine independent lanes.
 *
 * For every lane k in 0..8 and every index i in [0, count) this computes
 * c_k[i] = a_k[i] + b_k[i]. Nothing is read or written when count <= 0.
 *
 * @param count        Number of elements processed in each lane.
 * @param a_0 .. a_8   First operand array of each lane (read at [0, count)).
 * @param b_0 .. b_8   Second operand array of each lane (read at [0, count)).
 * @param c_0 .. c_8   Result array of each lane (written at [0, count)).
 */
void vadd(int count,
          int* a_0, int* b_0, int* c_0,
          int* a_1, int* b_1, int* c_1,
          int* a_2, int* b_2, int* c_2,
          int* a_3, int* b_3, int* c_3,
          int* a_4, int* b_4, int* c_4,
          int* a_5, int* b_5, int* c_5,
          int* a_6, int* b_6, int* c_6,
          int* a_7, int* b_7, int* c_7,
          int* a_8, int* b_8, int* c_8)
{
// The scalar argument is attached to the s_axilite "control" bundle.
#pragma HLS INTERFACE s_axilite port=count bundle=control
// Every pointer below gets its own m_axi bundle (gmem0 .. gmem26), and each
// pointer is additionally listed on the s_axilite "control" bundle.
// Lane 0: gmem0 .. gmem2
#pragma HLS INTERFACE m_axi port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
// Lane 1: gmem3 .. gmem5
#pragma HLS INTERFACE m_axi port=a_1 offset=slave bundle=gmem3
#pragma HLS INTERFACE s_axilite port=a_1 bundle=control
#pragma HLS INTERFACE m_axi port=b_1 offset=slave bundle=gmem4
#pragma HLS INTERFACE s_axilite port=b_1 bundle=control
#pragma HLS INTERFACE m_axi port=c_1 offset=slave bundle=gmem5
#pragma HLS INTERFACE s_axilite port=c_1 bundle=control
// Lane 2: gmem6 .. gmem8
#pragma HLS INTERFACE m_axi port=a_2 offset=slave bundle=gmem6
#pragma HLS INTERFACE s_axilite port=a_2 bundle=control
#pragma HLS INTERFACE m_axi port=b_2 offset=slave bundle=gmem7
#pragma HLS INTERFACE s_axilite port=b_2 bundle=control
#pragma HLS INTERFACE m_axi port=c_2 offset=slave bundle=gmem8
#pragma HLS INTERFACE s_axilite port=c_2 bundle=control
// Lane 3: gmem9 .. gmem11
#pragma HLS INTERFACE m_axi port=a_3 offset=slave bundle=gmem9
#pragma HLS INTERFACE s_axilite port=a_3 bundle=control
#pragma HLS INTERFACE m_axi port=b_3 offset=slave bundle=gmem10
#pragma HLS INTERFACE s_axilite port=b_3 bundle=control
#pragma HLS INTERFACE m_axi port=c_3 offset=slave bundle=gmem11
#pragma HLS INTERFACE s_axilite port=c_3 bundle=control
// Lane 4: gmem12 .. gmem14
#pragma HLS INTERFACE m_axi port=a_4 offset=slave bundle=gmem12
#pragma HLS INTERFACE s_axilite port=a_4 bundle=control
#pragma HLS INTERFACE m_axi port=b_4 offset=slave bundle=gmem13
#pragma HLS INTERFACE s_axilite port=b_4 bundle=control
#pragma HLS INTERFACE m_axi port=c_4 offset=slave bundle=gmem14
#pragma HLS INTERFACE s_axilite port=c_4 bundle=control
// Lane 5: gmem15 .. gmem17
#pragma HLS INTERFACE m_axi port=a_5 offset=slave bundle=gmem15
#pragma HLS INTERFACE s_axilite port=a_5 bundle=control
#pragma HLS INTERFACE m_axi port=b_5 offset=slave bundle=gmem16
#pragma HLS INTERFACE s_axilite port=b_5 bundle=control
#pragma HLS INTERFACE m_axi port=c_5 offset=slave bundle=gmem17
#pragma HLS INTERFACE s_axilite port=c_5 bundle=control
// Lane 6: gmem18 .. gmem20
#pragma HLS INTERFACE m_axi port=a_6 offset=slave bundle=gmem18
#pragma HLS INTERFACE s_axilite port=a_6 bundle=control
#pragma HLS INTERFACE m_axi port=b_6 offset=slave bundle=gmem19
#pragma HLS INTERFACE s_axilite port=b_6 bundle=control
#pragma HLS INTERFACE m_axi port=c_6 offset=slave bundle=gmem20
#pragma HLS INTERFACE s_axilite port=c_6 bundle=control
// Lane 7: gmem21 .. gmem23
#pragma HLS INTERFACE m_axi port=a_7 offset=slave bundle=gmem21
#pragma HLS INTERFACE s_axilite port=a_7 bundle=control
#pragma HLS INTERFACE m_axi port=b_7 offset=slave bundle=gmem22
#pragma HLS INTERFACE s_axilite port=b_7 bundle=control
#pragma HLS INTERFACE m_axi port=c_7 offset=slave bundle=gmem23
#pragma HLS INTERFACE s_axilite port=c_7 bundle=control
// Lane 8: gmem24 .. gmem26
#pragma HLS INTERFACE m_axi port=a_8 offset=slave bundle=gmem24
#pragma HLS INTERFACE s_axilite port=a_8 bundle=control
#pragma HLS INTERFACE m_axi port=b_8 offset=slave bundle=gmem25
#pragma HLS INTERFACE s_axilite port=b_8 bundle=control
#pragma HLS INTERFACE m_axi port=c_8 offset=slave bundle=gmem26
#pragma HLS INTERFACE s_axilite port=c_8 bundle=control
// The block-level return port also goes on the "control" bundle.
#pragma HLS INTERFACE s_axilite port=return bundle=control

    // One iteration handles index `idx` of all nine lanes, lane 0 first.
    for (int idx = 0; idx < count; ++idx) {
#pragma HLS PIPELINE
        const int sum_0 = a_0[idx] + b_0[idx];
        c_0[idx] = sum_0;
        const int sum_1 = a_1[idx] + b_1[idx];
        c_1[idx] = sum_1;
        const int sum_2 = a_2[idx] + b_2[idx];
        c_2[idx] = sum_2;
        const int sum_3 = a_3[idx] + b_3[idx];
        c_3[idx] = sum_3;
        const int sum_4 = a_4[idx] + b_4[idx];
        c_4[idx] = sum_4;
        const int sum_5 = a_5[idx] + b_5[idx];
        c_5[idx] = sum_5;
        const int sum_6 = a_6[idx] + b_6[idx];
        c_6[idx] = sum_6;
        const int sum_7 = a_7[idx] + b_7[idx];
        c_7[idx] = sum_7;
        const int sum_8 = a_8[idx] + b_8[idx];
        c_8[idx] = sum_8;
    }
}

}  // extern "C"
こんなコードを用意.design.cfgは,
platform=xilinx_u50_gen3x16_xdma_201920_3
debug=1
profile_kernel=data:all:all:all
[connectivity]
nk=vadd:1:vadd_1
sp=vadd_1.a_0:HBM[0]
sp=vadd_1.b_0:HBM[1]
sp=vadd_1.c_0:HBM[2]
sp=vadd_1.a_1:HBM[3]
sp=vadd_1.b_1:HBM[4]
sp=vadd_1.c_1:HBM[5]
sp=vadd_1.a_2:HBM[6]
sp=vadd_1.b_2:HBM[7]
sp=vadd_1.c_2:HBM[8]
sp=vadd_1.a_3:HBM[9]
sp=vadd_1.b_3:HBM[10]
sp=vadd_1.c_3:HBM[11]
sp=vadd_1.a_4:HBM[12]
sp=vadd_1.b_4:HBM[13]
sp=vadd_1.c_4:HBM[14]
sp=vadd_1.a_5:HBM[15]
sp=vadd_1.b_5:HBM[16]
sp=vadd_1.c_5:HBM[17]
sp=vadd_1.a_6:HBM[18]
sp=vadd_1.b_6:HBM[19]
sp=vadd_1.c_6:HBM[20]
sp=vadd_1.a_7:HBM[21]
sp=vadd_1.b_7:HBM[22]
sp=vadd_1.c_7:HBM[23]
sp=vadd_1.a_8:HBM[24]
sp=vadd_1.b_8:HBM[25]
sp=vadd_1.c_8:HBM[26]
ビルドして実行.
{{ref_image vadd_multi_arch.png}}
という感じでHBMがたくさん並んだ.
リソース使用量はLUTとRegisterがそれぞれ6,453と9,417.BRAMとDSPは使ってない.
{{ref_image vadd_multi_profiler.png}}
実行時のメモリバンド幅をみてみると,それぞれ1GBpsを超える性能がでてる.よかった.
512bit幅で読み書きするように修正(以下はループ部分の抜粋).
// Excerpt of the 512-bit-wide loop; the enclosing function signature is not
// shown in this snippet.
// NOTE(review): a_0 / b_0 / c_0 are presumably ap_uint<512>* in this version
// (each element is loaded into / stored from an ap_uint<512>) -- confirm
// against the full kernel source.
// One iteration per group of 16 values. Integer division discards any
// remainder, so the loop runs count / 16 times; presumably count is expected
// to be a multiple of 16 -- TODO confirm with the host code.
const int num = count / 16;
// 512-bit temporaries for the two operands and the result of one iteration.
ap_uint<512> tmp_a_0, tmp_b_0, tmp_c_0;
for(int i = 0; i < num; i++){
#pragma HLS PIPELINE II=1
// Read one 512-bit element from each input.
tmp_a_0 = a_0[i];
tmp_b_0 = b_0[i];
// Add the sixteen 32-bit slices independently; slice j covers bits
// [j*32+31 : j*32] of the 512-bit word.
for(int j = 0; j < 16; j++){
tmp_c_0(j*32+31, j*32) = tmp_a_0.range(j*32+31,j*32) + tmp_b_0.range(j*32+31,j*32);
}
// Store the assembled 512-bit result element.
c_0[i] = tmp_c_0;
}
バンド幅は8GBps〜10.5GBpsっていう感じ.
{{ref_image vadd_multi_512_profiler.png}}
リソース使用量は,LUTとレジスタが,それぞれ,49,778個と111,190個.
またBRAMを207個(15.4%相当)利用している.
:: 複数のHBMリージョンを使う.
C++コードはかわらず,たとえば,
extern "C" {

/**
 * Element-wise integer addition: c_0[i] = a_0[i] + b_0[i] for every
 * i in [0, count). Nothing is read or written when count <= 0.
 *
 * @param count  Number of elements to process.
 * @param a_0    First operand array (read at [0, count)).
 * @param b_0    Second operand array (read at [0, count)).
 * @param c_0    Result array (written at [0, count)).
 */
void vadd(int count,
          int* a_0, int* b_0, int* c_0)
{
// The scalar argument is attached to the s_axilite "control" bundle.
#pragma HLS INTERFACE s_axilite port=count bundle=control
// Each pointer gets its own m_axi bundle (gmem0 .. gmem2) and is additionally
// listed on the s_axilite "control" bundle.
#pragma HLS INTERFACE m_axi port=a_0 offset=slave bundle=gmem0
#pragma HLS INTERFACE s_axilite port=a_0 bundle=control
#pragma HLS INTERFACE m_axi port=b_0 offset=slave bundle=gmem1
#pragma HLS INTERFACE s_axilite port=b_0 bundle=control
#pragma HLS INTERFACE m_axi port=c_0 offset=slave bundle=gmem2
#pragma HLS INTERFACE s_axilite port=c_0 bundle=control
// The block-level return port also goes on the "control" bundle.
#pragma HLS INTERFACE s_axilite port=return bundle=control

    for (int idx = 0; idx < count; ++idx) {
#pragma HLS PIPELINE
        const int sum_0 = a_0[idx] + b_0[idx];
        c_0[idx] = sum_0;
    }
}

}  // extern "C"
で,design.cfgで,
platform=xilinx_u50_gen3x16_xdma_201920_3
debug=1
profile_kernel=data:all:all:all
[connectivity]
nk=vadd:1:vadd_1
sp=vadd_1.a_0:HBM[0:1]
sp=vadd_1.b_0:HBM[2:3]
sp=vadd_1.c_0:HBM[4:5]
とかする.
{{ref_image vadd_2_multi_hbm_arch.png}}
という感じに複数のHBMリージョンをぶらさげることができる.