준비물
.
├── cifar10
│ ├── build_cifar10_test.sh
│ ├── cifar10_labels.dat
│ ├── cifar10_performance.sh
│ ├── code
│ │ ├── build_app.sh
│ │ ├── build_get_dpu_fps.sh
│ │ └── src
│ │ ├── check_runtime_top5_cifar10.py
│ │ ├── get_dpu_fps.cc
│ │ └── main_int8.cc
│ ├── run_all_cifar10_target.sh
│ ├── v70_train1_resnet18_cifar10.xmodel
│ ├── v70_train2_resnet18_cifar10.xmodel
│ ├── vck190_train1_resnet18_cifar10.xmodel
│ ├── vck190_train2_resnet18_cifar10.xmodel
│ ├── vck5000_train1_resnet18_cifar10.xmodel
│ ├── vck5000_train2_resnet18_cifar10.xmodel
│ ├── vek280_train1_resnet18_cifar10.xmodel
│ ├── vek280_train2_resnet18_cifar10.xmodel
│ ├── zcu102_train1_resnet18_cifar10.xmodel
│ └── zcu102_train2_resnet18_cifar10.xmodel
├── common
│ ├── common.cpp
│ └── common.h
├── imagenet
│ ├── code_resnet50
│ │ ├── build_resnet50.sh
│ │ └── src
│ │ ├── check_runtime_top1_imagenet.py
│ │ ├── config
│ │ │ └── imagenet_config.py
│ │ └── main_resnet50.cc
│ ├── imagenet_performance.sh
│ ├── run_all_imagenet_target.sh
│ ├── v70_resnet18_imagenet.xmodel
│ ├── val_dataset.zip
│ ├── vck190_resnet18_imagenet.xmodel
│ ├── vck5000_resnet18_imagenet.xmodel
│ ├── vek280_resnet18_imagenet.xmodel
│ └── zcu102_resnet18_imagenet.xmodel
└── run_all_target.sh
8 directories, 34 files
코드 실행
root@xilinx-vek280-es1-20231:~/target_vek280
> ./run_all_target.sh vek280
target/cifar10/run_all_cifar10_target.sh/main()
# Run the complete CIFAR-10 flow, one stage after the other:
# clean, compile, prepare the test images, run the CNN, then end_cif10.
# Every stage is a function expected to be defined in this same script.
main() {
  local stage
  for stage in clean_cif10 compile_cif10 test_images_cif10 run_cnn_cif10 end_cif10; do
    "$stage"
  done
}
target/run_all_target.sh
#!/bin/bash
# Top-level driver: runs the CIFAR-10 flow and then the ImageNet flow.
#
# Usage: ./run_all_target.sh <target>
#   <target> is forwarded unchanged to both flows (the example run in these
#   notes uses "vek280").
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 <target>" >&2
  exit 1
fi

# Each sourced script receives "main <target>" as its positional parameters.
source ./cifar10/run_all_cifar10_target.sh main "$1"
source ./imagenet/run_all_imagenet_target.sh main "$1"
target/cifar10/run_all_cifar10_target.sh/clean_cif10()
# Remove build products, extracted test images, logs and old reports from
# ./cifar10, then recreate an empty ./cifar10/rpt directory.
#
# Must be called from the directory that contains cifar10/.
# Returns 1 without deleting anything when cifar10/ cannot be entered.
clean_cif10() {
  echo " "
  echo "clean cifar10"
  echo " "
  # Without this guard a failed cd would run the rm commands below in the
  # caller's directory.
  cd cifar10 || return 1
  rm -rf test rpt
  rm -f -- *~ run_cnn cnn* get_dpu_fps *.txt *.log
  mkdir -p rpt
  cd ..
}
clean cifar10
target/cifar10/run_all_cifar10_target.sh/compile_cif10() 앞부분
# Build both target executables from cifar10/code and move them into cifar10/.
# (Only the first part of the function is shown here; its remaining lines and
# the closing brace appear further down in these notes.)
compile_cif10(){
echo " "
echo "compile cifar10"
echo " "
cd cifar10/code
echo "PWD1 = " $PWD
# build_app.sh names its output after the current directory, i.e. "code".
bash -x ./build_app.sh
mv code ../cnn_resnet18_cifar10
# build_get_dpu_fps.sh writes "code" as well, hence the rename after each build.
bash -x ./build_get_dpu_fps.sh
mv code ../get_dpu_fps
# Back to the directory the function was called from.
cd ../..
target/cifar10/code/build_app.sh
#!/bin/bash
# Build the CIFAR-10 classification application
# (src/main_int8.cc + ../../common/common.cpp).
#
# The interpreter must be bash because the script uses [[ ]] and BASH_SOURCE.
# The shebang line must not carry a trailing comment: everything after the
# interpreter path is handed to the interpreter as an argument.
#
# Environment:
#   CXX  compiler command (default: g++). When it contains "sysroot" the
#        cross-compilation include/library paths are used instead of the
#        per-user install prefix under $HOME/.local.
# Output:
#   an executable named after the current directory, written into it.

cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 || exit 1

CXX=${CXX:-g++}
os=$(lsb_release -a | grep "Distributor ID" | sed 's/^.*:\s*//')
os_version=$(lsb_release -a | grep "Release" | sed 's/^.*:\s*//')
arch=$(uname -p)
target_info=${os}.${os_version}.${arch}
install_prefix_default=$HOME/.local/${target_info}

# CXX may carry extra options next to the compiler name, so split it into
# separate words before running it.
read -r -a cxx_cmd <<<"$CXX"
"${cxx_cmd[@]}" --version

# Prefer the "opencv4" pkg-config module and fall back to "opencv".
# grep intentionally prints the matching line, as the build log shows it.
if pkg-config --list-all | grep opencv4; then
  opencv_module=opencv4
else
  opencv_module=opencv
fi
read -r -a opencv_flags <<<"$(pkg-config --cflags --libs-only-L "$opencv_module")"

name=$(basename "$PWD")

if [[ "$CXX" == *"sysroot"* ]]; then
  # Cross build: "-I=" and "-L=" paths are taken relative to the sysroot.
  search_paths=(
    -I=/usr/include/opencv4
    -I=/install/Debug/include
    -I=/install/Release/include
    -L=/install/Debug/lib
    -L=/install/Release/lib
  )
else
  # Native build: look in the per-user Debug/Release install prefixes.
  search_paths=(
    "-I${install_prefix_default}.Debug/include"
    "-I${install_prefix_default}.Release/include"
    "-L${install_prefix_default}.Debug/lib"
    "-L${install_prefix_default}.Release/lib"
    "-Wl,-rpath=${install_prefix_default}.Debug/lib"
    "-Wl,-rpath=${install_prefix_default}.Release/lib"
  )
fi

# Sources, output name and libraries are the same for both kinds of build.
"${cxx_cmd[@]}" -O2 -fno-inline -I. \
  "${search_paths[@]}" \
  "-I$PWD/../../common" -o "$name" -std=c++17 \
  "$PWD/src/main_int8.cc" \
  "$PWD/../../common/common.cpp" \
  "-Wl,-rpath=$PWD/lib" \
  -lvart-runner \
  "${opencv_flags[@]}" \
  -lopencv_videoio \
  -lopencv_imgcodecs \
  -lopencv_highgui \
  -lopencv_imgproc \
  -lopencv_core \
  -lglog \
  -lxir \
  -lunilog \
  -lpthread
PWD1 = /home/root/target_vek280/cifar10/code
++ dirname ./build_app.sh
+ cd .
+ CXX=g++
++ sed 's/^.*:\s*//'
++ grep 'Distributor ID'
++ lsb_release -a
+ os=petalinux
++ grep Release
++ sed 's/^.*:\s*//'
++ lsb_release -a
+ os_version=2023.1+release-S05010539
++ uname -p
+ arch=unknown
+ target_info=petalinux.2023.1+release-S05010539.unknown
+ install_prefix_default=/home/root/.local/petalinux.2023.1+release-S05010539.unknown
+ g++ --version
g++ (GCC) 12.2.0
Copyright (C) 2022 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ result=0
+ pkg-config --list-all
+ grep opencv4
opencv4 OpenCV - Open Source Computer Vision Library
+ result=1
+ '[' 1 -eq 1 ']'
++ pkg-config --cflags --libs-only-L opencv4
+ OPENCV_FLAGS=-I/usr/include/opencv4
++ basename /home/root/target_vek280/cifar10/code
+ name=code
+ [[ g++ == *\s\y\s\r\o\o\t* ]]
+ g++ -O2 -fno-inline -I. -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/include -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/include -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -I/home/root/target_vek280/cifar10/code/../../common -o code -std=c++17 /home/root/target_vek280/cifar10/code/src/main_int8.cc /home/root/target_vek280/cifar10/code/../../common/common.cpp -Wl,-rpath=/home/root/target_vek280/cifar10/code/lib -lvart-runner -I/usr/include/opencv4 -lopencv_videoio -lopencv_imgcodecs -lopencv_highgui -lopencv_imgproc -lopencv_core -lglog -lxir -lunilog -lpthread
target/cifar10/code/build_get_dpu_fps.sh
#!/bin/bash
# Build the DPU throughput benchmark
# (src/get_dpu_fps.cc + ../../common/common.cpp).
#
# The shebang line must not carry a trailing comment: everything after the
# interpreter path is handed to the interpreter as an argument.
#
# Environment:
#   CXX  compiler command (default: g++). When it contains "sysroot" the
#        cross-compilation include/library paths are used instead of the
#        per-user install prefix under $HOME/.local.
# Output:
#   an executable named after the current directory, written into it.

cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 || exit 1

CXX=${CXX:-g++}
os=$(lsb_release -a | grep "Distributor ID" | sed 's/^.*:\s*//')
os_version=$(lsb_release -a | grep "Release" | sed 's/^.*:\s*//')
arch=$(uname -p)
target_info=${os}.${os_version}.${arch}
install_prefix_default=$HOME/.local/${target_info}

# CXX may carry extra options next to the compiler name, so split it into
# separate words before running it.
read -r -a cxx_cmd <<<"$CXX"
"${cxx_cmd[@]}" --version

# Prefer the "opencv4" pkg-config module and fall back to "opencv".
# grep intentionally prints the matching line, as the build log shows it.
if pkg-config --list-all | grep opencv4; then
  opencv_module=opencv4
else
  opencv_module=opencv
fi
read -r -a opencv_flags <<<"$(pkg-config --cflags --libs-only-L "$opencv_module")"

name=$(basename "$PWD")

if [[ "$CXX" == *"sysroot"* ]]; then
  # Cross build: "-I=" and "-L=" paths are taken relative to the sysroot.
  search_paths=(
    -I=/usr/include/opencv4
    -I=/install/Debug/include
    -I=/install/Release/include
    -L=/install/Debug/lib
    -L=/install/Release/lib
  )
else
  # Native build: look in the per-user Debug/Release install prefixes.
  search_paths=(
    "-I${install_prefix_default}.Debug/include"
    "-I${install_prefix_default}.Release/include"
    "-L${install_prefix_default}.Debug/lib"
    "-L${install_prefix_default}.Release/lib"
    "-Wl,-rpath=${install_prefix_default}.Debug/lib"
    "-Wl,-rpath=${install_prefix_default}.Release/lib"
  )
fi

# Sources, output name and libraries are the same for both kinds of build.
"${cxx_cmd[@]}" -O2 -fno-inline -I. \
  "${search_paths[@]}" \
  "-I$PWD/../../common" -o "$name" -std=c++17 \
  "$PWD/src/get_dpu_fps.cc" \
  "$PWD/../../common/common.cpp" \
  "-Wl,-rpath=$PWD/lib" \
  -lvart-runner \
  "${opencv_flags[@]}" \
  -lopencv_videoio \
  -lopencv_imgcodecs \
  -lopencv_highgui \
  -lopencv_imgproc \
  -lopencv_core \
  -lglog \
  -lxir \
  -lunilog \
  -lpthread
++ dirname ./build_get_dpu_fps.sh
+ cd .
+ CXX=g++
++ sed 's/^.*:\s*//'
++ lsb_release -a
++ grep 'Distributor ID'
+ os=petalinux
++ lsb_release -a
++ sed 's/^.*:\s*//'
++ grep Release
+ os_version=2023.1+release-S05010539
++ uname -p
+ arch=unknown
+ target_info=petalinux.2023.1+release-S05010539.unknown
+ install_prefix_default=/home/root/.local/petalinux.2023.1+release-S05010539.unknown
+ g++ --version
g++ (GCC) 12.2.0
Copyright (C) 2022 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ result=0
+ grep opencv4
+ pkg-config --list-all
opencv4 OpenCV - Open Source Computer Vision Library
+ result=1
+ '[' 1 -eq 1 ']'
++ pkg-config --cflags --libs-only-L opencv4
+ OPENCV_FLAGS=-I/usr/include/opencv4
++ basename /home/root/target_vek280/cifar10/code
+ name=code
+ [[ g++ == *\s\y\s\r\o\o\t* ]]
+ g++ -O2 -fno-inline -I. -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/include -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/include -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -I/home/root/target_vek280/cifar10/code/../../common -o code -std=c++17 /home/root/target_vek280/cifar10/code/src/get_dpu_fps.cc /home/root/target_vek280/cifar10/code/../../common/common.cpp -Wl,-rpath=/home/root/target_vek280/cifar10/code/lib -lvart-runner -I/usr/include/opencv4 -lopencv_videoio -lopencv_imgcodecs -lopencv_highgui -lopencv_imgproc -lopencv_core -lglog -lxir -lunilog -lpthread
/home/root/target_vek280/cifar10/code/src/get_dpu_fps.cc: In function 'void runDPU(vart::Runner*, int8_t*, int8_t*)':
/home/root/target_vek280/cifar10/code/src/get_dpu_fps.cc:47:45: warning: 'virtual const std::vector<int> xir::Tensor::get_dims() const' is deprecated: Tensor::get_dims() will be removed in the future version. Please use the Tensor::get_shape() instead. [-Wdeprecated-declarations]
47 | auto out_dims = outputTensors[0]->get_dims();
| ~~~~~~~~~~~~~~~~~~~~~~~~~~^~
In file included from /usr/include/vart/mm/host_flat_tensor_buffer.hpp:21,
from /home/root/target_vek280/cifar10/code/../../common/common.h:23,
from /home/root/target_vek280/cifar10/code/src/get_dpu_fps.cc:31:
/usr/include/xir/tensor/tensor.hpp:136:7: note: declared here
136 | get_dims() const = 0;
| ^~~~~~~~
/home/root/target_vek280/cifar10/code/src/get_dpu_fps.cc:48:43: warning: 'virtual const std::vector<int> xir::Tensor::get_dims() const' is deprecated: Tensor::get_dims() will be removed in the future version. Please use the Tensor::get_shape() instead. [-Wdeprecated-declarations]
48 | auto in_dims = inputTensors[0]->get_dims();
| ~~~~~~~~~~~~~~~~~~~~~~~~~^~
/usr/include/xir/tensor/tensor.hpp:136:7: note: declared here
136 | get_dims() const = 0;
| ^~~~~~~~
target/cifar10/run_all_cifar10_target.sh/compile_cif10() 뒷부분
echo "PWD2 = " $PWD
}
PWD2 = /home/root/target_vek280
target/cifar10/run_all_cifar10_target.sh/test_images_cif10()
# Unpack and flatten the CIFAR-10 test images by running
# cifar10/build_cifar10_test.sh from inside cifar10/.
#
# Must be called from the directory that contains cifar10/.
# Returns 1 and leaves the working directory untouched when cifar10/ cannot
# be entered; otherwise the working directory is restored before returning.
test_images_cif10() {
  echo " "
  echo "build test images for cifar10"
  echo " "
  # Without this guard the later "cd .." would move the caller one level up.
  cd cifar10 || return 1
  bash ./build_cifar10_test.sh
  cd ..
  echo " "
  echo "PWD3 = " "$PWD"
}
build test images for cifar10
PWD3 = /home/root/target_vek280
target/cifar10/build_cifar10_test.sh
#!/bin/bash
# Unpack test.tar and flatten the extracted tree: every PNG found in
# test/<class>/ is moved directly into test/ and the emptied class directory
# is removed.
#
# Run from the directory that contains test.tar. Exits with status 1 when the
# archive cannot be extracted or test/ cannot be entered.

if ! tar -xf test.tar &>/dev/null; then
  echo "ERROR: cannot extract test.tar" >&2
  exit 1
fi

# Every following path is relative to test/, so a failed cd must stop the script.
cd ./test || exit 1

for class in automobile airplane bird cat deer dog frog horse ship truck; do
  if [[ ! -d "$class" ]]; then
    echo "WARNING: class directory test/${class} not found, skipping" >&2
    continue
  fi
  pngs=("$class"/*.png)
  # An unmatched glob stays literal, so make sure the first entry really exists.
  if [[ -e "${pngs[0]}" ]]; then
    mv -- "${pngs[@]}" .
  fi
  rm -r -- "$class"
done

cd ..
target/cifar10/run_all_cifar10_target.sh/run_cnn_cif10() 앞부분
# Run the CIFAR-10 classifier and save its predictions.
# (Only the first part of the function is shown here; its remaining lines and
# the closing brace appear further down in these notes.)
run_cnn_cif10(){
echo " "
echo " run cifar10 CNN"
echo " "
cd cifar10
# Arguments: xmodel file, test image directory, labels file.
# tee shows the output and also stores it in rpt/predictions_cifar10_resnet18.log.
# NOTE(review): TARGET is not assigned in the visible part of this script - confirm where it is set.
./cnn_resnet18_cifar10 ./${TARGET}_train1_resnet18_cifar10.xmodel ./test/ ./cifar10_labels.dat | tee ./rpt/predictions_cifar10_resnet18.log
run cifar10 CNN
#include <assert.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
#include "common.h"
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
GraphInfo shapes;
string baseImagePath, wordsPath;
/**
 * @brief Collect the names of the image files located directly in a directory.
 *
 * An entry is kept when it is a regular file (or its type is unknown to
 * readdir) and its name has one of the extensions JPEG, jpeg, JPG, jpg, PNG
 * or png after the last dot. Names without any dot are ignored.
 *
 * Prints an error and terminates the process with exit(1) when the path
 * cannot be examined, is not a directory, or cannot be opened.
 *
 * @param path   directory to scan
 * @param images output list of file names (without the directory part);
 *               cleared before it is filled
 */
void ListImages(std::string const& path, std::vector<std::string>& images) {
  images.clear();

  struct stat s;
  // lstat() leaves `s` unspecified on failure, so check its result before
  // looking at st_mode.
  if (lstat(path.c_str(), &s) != 0 || !S_ISDIR(s.st_mode)) {
    fprintf(stderr, "Error: %s is not a valid directory!\n", path.c_str());
    exit(1);
  }

  DIR* dir = opendir(path.c_str());
  if (dir == nullptr) {
    fprintf(stderr, "Error: Open %s path failed.\n", path.c_str());
    exit(1);
  }

  static const char* const kImageExts[] = {"JPEG", "jpeg", "JPG",
                                           "jpg",  "PNG",  "png"};
  struct dirent* entry;
  while ((entry = readdir(dir)) != nullptr) {
    if (entry->d_type != DT_REG && entry->d_type != DT_UNKNOWN) {
      continue;
    }
    const std::string name = entry->d_name;
    const std::string::size_type dot = name.find_last_of('.');
    if (dot == std::string::npos) {
      // No extension at all: without this check npos + 1 wraps to 0 and the
      // whole name would be compared as if it were the extension.
      continue;
    }
    const std::string ext = name.substr(dot + 1);
    for (const char* known : kImageExts) {
      if (ext == known) {
        images.push_back(name);
        break;
      }
    }
  }
  closedir(dir);
}
/**
 * @brief Read a text file into a list, one entry per line.
 *
 * Prints an error and terminates the process with exit(1) when the file
 * cannot be opened.
 *
 * @param path  file to read
 * @param kinds output list of lines; cleared before it is filled
 */
void LoadWords(std::string const& path, std::vector<std::string>& kinds) {
  kinds.clear();

  std::ifstream labelFile(path);
  if (!labelFile) {
    fprintf(stderr, "Error : Open %s failed.\n", path.c_str());
    exit(1);
  }

  for (std::string label; std::getline(labelFile, label);) {
    kinds.push_back(label);
  }
  // labelFile is closed by its destructor.
}
/**
 * @brief Softmax over quantized int8 values.
 *
 * Each input is first de-quantized as data[i] * scale. The largest of these
 * values is subtracted before exponentiation: the result is mathematically
 * the same, but exp() can no longer overflow to infinity (which would turn
 * the output into NaN) when data[i] * scale is large.
 *
 * @param data   input values, `size` elements
 * @param size   number of elements; nothing is written when it is 0
 * @param result output probabilities, `size` elements
 * @param scale  factor applied to every input value
 */
void CPUCalcSoftmax(const int8_t* data, size_t size, float* result, float scale) {
  assert(data && result);
  if (size == 0) {
    return;
  }

  float maxLogit = (float)data[0] * scale;
  for (size_t i = 1; i < size; i++) {
    const float logit = (float)data[i] * scale;
    if (logit > maxLogit) {
      maxLogit = logit;
    }
  }

  double sum = 0.0;
  for (size_t i = 0; i < size; i++) {
    result[i] = std::exp((float)data[i] * scale - maxLogit);
    sum += result[i];
  }
  for (size_t i = 0; i < size; i++) {
    result[i] /= sum;
  }
}
/**
 * @brief Find the largest int8 value and its position.
 *
 * When the largest value occurs several times, the first occurrence wins.
 *
 * @param data      input values; element 0 is always read
 * @param size      number of elements
 * @param res_val   receives the largest value multiplied by `scale`
 * @param res_index receives the index of the largest value
 * @param scale     factor applied to the reported value
 */
void ArgMax(const int8_t* data, size_t size, float *res_val, int *res_index, float scale)
{
  size_t bestPos = 0;
  for (size_t pos = 1; pos < size; ++pos) {
    if (data[pos] > data[bestPos]) {
      bestPos = pos;
    }
  }
  *res_val = (float) (data[bestPos] * scale);
  *res_index = (int) bestPos;
}
/**
 * @brief Print the k highest scores together with their class names.
 *
 * One line is printed per entry, best first, in the form
 * "top[i] prob = <score> name = <label>".
 *
 * @param d      scores, `size` elements
 * @param size   number of scores
 * @param k      number of entries requested; at most `size` entries are
 *               printed, so asking for more than exist is safe
 * @param vkinds class names indexed like `d`; "unknown" is printed for an
 *               index that has no name
 */
void TopK(const float* d, int size, int k, std::vector<std::string>& vkinds) {
  assert(d && size > 0 && k > 0);

  std::priority_queue<std::pair<float, int>> q;
  for (int i = 0; i < size; ++i) {
    q.push(std::pair<float, int>(d[i], i));
  }

  // Never pop more entries than were pushed.
  const int shown = (k < size) ? k : size;
  for (int rank = 0; rank < shown; ++rank) {
    const std::pair<float, int> best = q.top();
    q.pop();
    const char* label =
        (static_cast<size_t>(best.second) < vkinds.size())
            ? vkinds[best.second].c_str()
            : "unknown";
    printf("top[%d] prob = %-8f name = %s\n", rank, d[best.second], label);
  }
}
/**
 * @brief Classify every image found in baseImagePath and print the top-5
 *        predictions of each one.
 *
 * Images are processed in batches of the size reported by the runner's input
 * tensor. Every image is resized to the network input size, converted to
 * int8 with the input scale, run through the runner, and the int8 output is
 * turned into probabilities with CPUCalcSoftmax before TopK prints it.
 *
 * Reads the globals baseImagePath, wordsPath and shapes. Returns early with
 * an error message when no image or no label is available; terminates the
 * process when an image file cannot be decoded.
 *
 * @param runner runner used for inference
 */
void run_CNN(vart::Runner* runner) {
  vector<string> kinds, images;

  ListImages(baseImagePath, images);
  if (images.size() == 0) {
    cerr << "\nError: No images existing under " << baseImagePath << endl;
    return;
  }
  LoadWords(wordsPath, kinds);
  if (kinds.size() == 0) {
    cerr << "\nError: No words exist in file " << wordsPath << endl;
    return;
  }

  auto outputTensors = runner->get_output_tensors();
  auto inputTensors = runner->get_input_tensors();
  auto out_dims = outputTensors[0]->get_shape();
  auto in_dims = inputTensors[0]->get_shape();
  auto input_scale = get_input_scale(inputTensors[0]);
  auto output_scale = get_output_scale(outputTensors[0]);

  int outSize = shapes.outTensorList[0].size;
  int inSize = shapes.inTensorList[0].size;
  int inHeight = shapes.inTensorList[0].height;
  int inWidth = shapes.inTensorList[0].width;
  int batchSize = in_dims[0];
  cout << "OUT size " << outSize << endl;
  cout << "IN size " << inSize << endl;
  cout << "IN Height " << inHeight << endl;
  cout << "IN Width " << inWidth << endl;
  cout << "batchSize " << batchSize << endl;

  // File names are appended to the directory by plain concatenation, so make
  // sure the directory part ends with a separator.
  string imageDir = baseImagePath;
  if (!imageDir.empty() && imageDir.back() != '/') {
    imageDir += '/';
  }

  // Buffers are owned by vectors so every exit path releases them.
  vector<int8_t> imageInputs(inSize * batchSize);
  vector<float> softmax(outSize);
  vector<int8_t> FCResult(batchSize * outSize);

  std::vector<std::unique_ptr<vart::TensorBuffer>> inputs, outputs;
  std::vector<vart::TensorBuffer*> inputsPtr, outputsPtr;
  std::vector<std::shared_ptr<xir::Tensor>> batchTensors;

  for (unsigned int n = 0; n < images.size(); n += batchSize) {
    // The last batch may be only partly filled.
    unsigned int runSize =
        (images.size() < (n + batchSize)) ? (images.size() - n) : batchSize;
    in_dims[0] = runSize;
    out_dims[0] = batchSize;

    for (unsigned int i = 0; i < runSize; i++) {
      Mat image = imread(imageDir + images[n + i]);
      if (image.empty()) {
        cerr << "\nError: cannot read image " << imageDir + images[n + i]
             << endl;
        exit(1);
      }
      // cv::Size takes (width, height).
      Mat resized;
      resize(image, resized, Size(inWidth, inHeight), 0, 0, INTER_NEAREST);
      for (int h = 0; h < inHeight; h++) {
        for (int w = 0; w < inWidth; w++) {
          for (int c = 0; c < 3; c++) {
            // Map the pixel from [0, 255] to [-1, 1], then apply the input scale.
            imageInputs[i * inSize + h * inWidth * 3 + w * 3 + c] =
                (int8_t)((resized.at<Vec3b>(h, w)[c] / 255.0f - 0.5f) * 2 *
                         input_scale);
          }
        }
      }
    }

    // Wrap the raw buffers in tensor buffers describing this batch.
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(inputTensors[0]->get_name(), in_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    inputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        imageInputs.data(), batchTensors.back().get()));
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(
        xir::Tensor::create(outputTensors[0]->get_name(), out_dims,
                            xir::DataType{xir::DataType::XINT, 8u})));
    outputs.push_back(std::make_unique<CpuFlatTensorBuffer>(
        FCResult.data(), batchTensors.back().get()));

    inputsPtr.clear();
    outputsPtr.clear();
    inputsPtr.push_back(inputs[0].get());
    outputsPtr.push_back(outputs[0].get());

    // Start the job and block until it has finished.
    auto job_id = runner->execute_async(inputsPtr, outputsPtr);
    runner->wait(job_id.first, -1);

    for (unsigned int i = 0; i < runSize; i++) {
      cout << "\nImage : " << images[n + i] << endl;
      CPUCalcSoftmax(&FCResult[i * outSize], outSize, softmax.data(),
                     output_scale);
      TopK(softmax.data(), outSize, 5, kinds);
    }

    // Drop the buffers first and the tensors they point to afterwards, so the
    // tensor list does not keep growing with every batch.
    inputs.clear();
    outputs.clear();
    batchTensors.clear();
  }
}
int main(int argc, char* argv[])
{
if (argc != 4) {
cout << "Usage: <executable> <xmodel> <test_images_dir>, <labels_filename>" << endl;
return -1;
}
baseImagePath = std::string(argv[2]);
wordsPath = std::string(argv[3]);
auto graph = xir::Graph::deserialize(argv[1]);
auto subgraph = get_dpu_subgraph(graph.get());
CHECK_EQ(subgraph.size(), 1u)
<< "CNN should have one and only one dpu subgraph.";
LOG(INFO) << "create running for subgraph: " << subgraph[0]->get_name();
auto runner = vart::Runner::create_runner(subgraph[0], "run");
auto inputTensors = runner->get_input_tensors();
auto outputTensors = runner->get_output_tensors();
int inputCnt = inputTensors.size();
int outputCnt = outputTensors.size();
TensorShape inshapes[inputCnt];
TensorShape outshapes[outputCnt];
shapes.inTensorList = inshapes;
shapes.outTensorList = outshapes;
getTensorShape(runner.get(), &shapes, inputCnt, outputCnt);
run_CNN(runner.get());
return 0;
}
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:42:20.506608 1925737 main_int8.cc:314] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
OUT size 10
IN size 3072
IN Height 32
IN Width 32
batchSize 14
Image : truck_5790.png
top[0] prob = 1.000000 name = truck
top[1] prob = 0.000000 name = automobile
top[2] prob = 0.000000 name = airplane
top[3] prob = 0.000000 name = horse
top[4] prob = 0.000000 name = ship
Image : horse_543.png
top[0] prob = 1.000000 name = horse
top[1] prob = 0.000000 name = dog
top[2] prob = 0.000000 name = cat
top[3] prob = 0.000000 name = deer
top[4] prob = 0.000000 name = frog
Image : airplane_5544.png
top[0] prob = 0.998793 name = airplane
top[1] prob = 0.001169 name = cat
top[2] prob = 0.000028 name = dog
top[3] prob = 0.000006 name = horse
top[4] prob = 0.000002 name = deer
Image : horse_9091.png
top[0] prob = 0.999998 name = horse
top[1] prob = 0.000002 name = deer
top[2] prob = 0.000001 name = dog
top[3] prob = 0.000000 name = truck
top[4] prob = 0.000000 name = automobile
…
Image : automobile_4598.png
top[0] prob = 0.999942 name = automobile
top[1] prob = 0.000058 name = truck
top[2] prob = 0.000000 name = cat
top[3] prob = 0.000000 name = frog
top[4] prob = 0.000000 name = airplane
target/cifar10/run_all_cifar10_target.sh/run_cnn_cif10() 뒷부분
# Run the accuracy and throughput script for the selected target, with tracing.
bash -x ./cifar10_performance.sh ${TARGET}
echo "PWD4 = " $PWD
# Leave cifar10/ again (the function entered it at its start).
cd ..
}
#!/bin/bash
# Report the accuracy of the stored CIFAR-10 predictions and measure the DPU
# throughput with 1, 2 and 3 threads.
#
# Usage: cifar10_performance.sh <target>
#   <target> selects ./<target>_train1_resnet18_cifar10.xmodel
# Run from the cifar10 directory; all paths below are relative to it.
TARGET=${1:?usage: cifar10_performance.sh <target>}

echo " "
echo " CIFAR10 RESNET18 TOP5 ACCURACY"
echo " "
python3 ./code/src/check_runtime_top5_cifar10.py -i ./rpt/predictions_cifar10_resnet18.log | tee ./rpt/results_predictions.log

echo " "
echo " CIFAR10 RESNET18 PERFORMANCE (fps)"
echo " "
# get_dpu_fps arguments: xmodel, number of threads, number of images.
for threads in 1 2 3; do
  ./get_dpu_fps "./${TARGET}_train1_resnet18_cifar10.xmodel" "$threads" 10000 | tee "./rpt/log${threads}.txt"
done
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import numpy as np
from datetime import datetime
import os
import argparse
# Class name -> numeric label id.
labelNames = { "airplane" : 0, "automobile" : 1, "bird" : 2, "cat" : 3, "deer" : 4, "dog" : 5,
               "frog" : 6, "horse" : 7, "ship" : 8, "truck" : 9}


def parse_args(argv=None):
    """Parse the command line and return the options as a dict."""
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--file", required=True, help="input logfile")
    ap.add_argument("-n", "--numel", default="5000", help="number of test images")
    return vars(ap.parse_args(argv))


def score_predictions(lines, numel):
    """Count top-1 and top-5 hits in the lines of a prediction log.

    A record is a line containing "Image" followed by five "top[i] ... name =
    <class>" lines. The true class is the part of the image file name before
    the first underscore.

    Args:
        lines: list of log lines.
        numel: maximum number of records to score. Exactly this many are
            scored when the log holds at least that many.

    Returns:
        dict with the counters img_count, top1_true, top1_false, top5_true and
        top5_false, plus the arrays test_ids and preds (shape (numel, 1))
        holding the true and predicted label ids of every scored record.
    """
    top1_true = 0
    top1_false = 0
    top5_true = 0
    top5_false = 0
    test_ids = np.zeros((numel, 1))
    preds = np.zeros((numel, 1))
    idx = 0

    for ln, line in enumerate(lines):
        if idx >= numel:
            # All requested records have been scored.
            break
        if "Image" not in line:
            continue
        top5_lines = lines[ln:ln + 6]
        if len(top5_lines) < 6:
            # Truncated record at the end of the log.
            break

        filename = top5_lines[0].split("Image :")[1]
        class_name = filename[:filename.index("_")].strip()
        predicted = top5_lines[1].split("name = ")[1].strip()

        if class_name in top5_lines[1]:
            top1_true += 1
            top5_true += 1
        else:
            top1_false += 1
            if any(class_name in candidate for candidate in top5_lines[2:6]):
                top5_true += 1
            else:
                top5_false += 1

        test_ids[idx] = labelNames[class_name]
        preds[idx] = labelNames[predicted]
        idx += 1

    return {
        "img_count": idx,
        "top1_true": top1_true,
        "top1_false": top1_false,
        "top5_true": top5_true,
        "top5_false": top5_false,
        "test_ids": test_ids,
        "preds": preds,
    }


def main(argv=None):
    """Read the log file named on the command line and print the accuracy report."""
    args = parse_args(argv)
    logfile = args["file"]
    try:
        with open(logfile, "r") as f:
            lines = f.readlines()
    except IOError:
        print("cannot open ", logfile)
        # Nothing can be scored without the log.
        raise SystemExit(1)
    print(logfile, " has ", len(lines), " lines")

    result = score_predictions(lines, int(args["numel"]))
    img_count = result["img_count"]
    top1_true = result["top1_true"]
    top1_false = result["top1_false"]
    top5_true = result["top5_true"]
    top5_false = result["top5_false"]

    assert (top1_true+top1_false) == img_count, "ERROR: top1 true+false not equal to the number of images"
    assert (top5_true+top5_false) == img_count, "ERROR: top5 true+false not equal to the number of images"
    print("number of total images predicted ", img_count)
    print("number of top1 false predictions ", top1_false)
    print("number of top1 right predictions ", top1_true)
    print("number of top5 false predictions ", top5_false)
    print("number of top5 right predictions ", top5_true)

    if img_count == 0:
        # Avoid dividing by zero when the log holds no prediction.
        print("no predictions found in ", logfile)
        raise SystemExit(1)

    top1_accuracy = float(top1_true)/(top1_true+top1_false)
    top5_accuracy = float(top5_true)/(top5_true+top5_false)
    print("top1 accuracy = %.2f" % top1_accuracy)
    print("top5 accuracy = %.2f" % top5_accuracy)


if __name__ == "__main__":
    main()
CIFAR10 RESNET18 TOP5 ACCURACY
+ echo ' '
+ tee ./rpt/results_predictions.log
+ python3 ./code/src/check_runtime_top5_cifar10.py -i ./rpt/predictions_cifar10_resnet18.log
./rpt/predictions_cifar10_resnet18.log has 35008 lines
number of total images predicted 4999
number of top1 false predictions 816
number of top1 right predictions 4183
number of top5 false predictions 37
number of top5 right predictions 4962
top1 accuracy = 0.84
top5 accuracy = 0.99
echo " "
echo " CIFAR10 RESNET18 PERFORMANCE (fps)"
echo " "
# get_dpu_fps arguments: xmodel, number of threads, number of images.
# Each run is shown on the console and also stored in its own log file.
./get_dpu_fps ./${TARGET}_train1_resnet18_cifar10.xmodel 1 10000 | tee ./rpt/log1.txt
./get_dpu_fps ./${TARGET}_train1_resnet18_cifar10.xmodel 2 10000 | tee ./rpt/log2.txt
./get_dpu_fps ./${TARGET}_train1_resnet18_cifar10.xmodel 3 10000 | tee ./rpt/log3.txt
# Merge the three per-run logs into one report, then delete the per-run logs.
cat ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt > ./rpt/${TARGET}_train1_resnet18_cifar10_results_fps.log
rm -f ./rpt/log?.txt
echo " "
#include <assert.h>
#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <chrono>
#include <cassert>
#include <cmath>
#include <cstdio>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <iostream>
#include <queue>
#include <string>
#include <vector>
#include <thread>
#include "common.h"
using namespace std;
using namespace std::chrono;
GraphInfo shapes;
int num_threads = 0;
int num_of_images = 0;
int num_images_x_thread = 0;
/**
 * @brief Worker body: push this thread's share of images through one runner.
 *
 * Processes num_images_x_thread images (global) in batches of the size
 * reported by the runner's input tensor, reading from imageInputs and writing
 * to FCResult. Every batch is submitted and waited for before the next one
 * starts.
 *
 * @param runner      runner used by this worker
 * @param imageInputs start of this worker's input area
 * @param FCResult    start of this worker's output area
 */
void runDPU(vart::Runner* runner, int8_t *imageInputs, int8_t *FCResult)
{
  auto outputTensors = runner->get_output_tensors();
  auto inputTensors = runner->get_input_tensors();
  // get_shape() replaces get_dims(), which the compiler reports as deprecated.
  auto out_dims = outputTensors[0]->get_shape();
  auto in_dims = inputTensors[0]->get_shape();

  int batchSize = in_dims[0];
  int outSize = shapes.outTensorList[0].size;
  int inSize = shapes.inTensorList[0].size;
  if (batchSize <= 0) {
    // The loop below could never advance.
    return;
  }

  std::vector<std::unique_ptr<vart::TensorBuffer>> inputs, outputs;
  std::vector<vart::TensorBuffer*> inputsPtr, outputsPtr;
  std::vector<std::shared_ptr<xir::Tensor>> batchTensors;

  for (int n = 0; n < num_images_x_thread; n += batchSize)
  {
    int8_t *loc_imageInputs = imageInputs + static_cast<size_t>(n) * inSize;
    int8_t *loc_FCResult = FCResult + static_cast<size_t>(n) * outSize;

    // Wrap the raw buffers of this batch in tensor buffers.
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(xir::Tensor::create(inputTensors[0]->get_name(),
        in_dims, xir::DataType{xir::DataType::XINT, 8u})));
    inputs.push_back(std::make_unique<CpuFlatTensorBuffer>(loc_imageInputs, batchTensors.back().get()));
    batchTensors.push_back(std::shared_ptr<xir::Tensor>(xir::Tensor::create(outputTensors[0]->get_name(),
        out_dims, xir::DataType{xir::DataType::XINT, 8u})));
    outputs.push_back(std::make_unique<CpuFlatTensorBuffer>(loc_FCResult, batchTensors.back().get()));

    inputsPtr.clear();
    outputsPtr.clear();
    inputsPtr.push_back(inputs[0].get());
    outputsPtr.push_back(outputs[0].get());

    // Start the job and block until it has finished.
    auto job_id = runner->execute_async(inputsPtr, outputsPtr);
    runner->wait(job_id.first, -1);

    // Drop the buffers first and the tensors they point to afterwards, so the
    // tensor list does not keep growing with every batch.
    inputs.clear();
    outputs.clear();
    batchTensors.clear();
  }
}
int main(int argc, char* argv[])
{
if (argc != 4) {
cout << "Usage: get_dpu_fps xmodel_pathname num_of_threads(from 1 to 6) num_of_images" << endl;
return -1;
}
num_threads = atoi(argv[2]);
assert( (num_threads<=6)&(num_threads>=1) );
num_of_images = atoi(argv[3]);
for (int i = 0; i< argc; i++)
cout << argv[i] << " ";
cout << endl;
auto graph = xir::Graph::deserialize(argv[1]);
auto subgraph = get_dpu_subgraph(graph.get());
CHECK_EQ(subgraph.size(), 1u)
<< "CNN should have one and only one dpu subgraph.";
LOG(INFO) << "create running for subgraph: " << subgraph[0]->get_name();
auto runner = vart::Runner::create_runner(subgraph[0], "run");
auto runner1 = vart::Runner::create_runner(subgraph[0], "run");
auto runner2 = vart::Runner::create_runner(subgraph[0], "run");
auto runner3 = vart::Runner::create_runner(subgraph[0], "run");
auto runner4 = vart::Runner::create_runner(subgraph[0], "run");
auto runner5 = vart::Runner::create_runner(subgraph[0], "run");
auto inputTensors = runner->get_input_tensors();
auto outputTensors = runner->get_output_tensors();
auto out_dims = outputTensors[0]->get_shape();
auto in_dims = inputTensors[0]->get_shape();
int inputCnt = inputTensors.size();
int outputCnt = outputTensors.size();
TensorShape inshapes[inputCnt];
TensorShape outshapes[outputCnt];
shapes.inTensorList = inshapes;
shapes.outTensorList = outshapes;
getTensorShape(runner.get(), &shapes, inputCnt, outputCnt);
int outSize = shapes.outTensorList[0].size;
int outHeight = shapes.outTensorList[0].height;
int outWidth = shapes.outTensorList[0].width;
int inSize = shapes.inTensorList[0].size;
int inHeight = shapes.inTensorList[0].height;
int inWidth = shapes.inTensorList[0].width;
int batchSize = in_dims[0];
int num_of_classes = outSize / (outHeight * outWidth);
auto input_scale = get_input_scale(inputTensors[0]);
auto output_scale = get_output_scale(outputTensors[0]);
cout << "outSize " << outSize << endl;
cout << "inSize " << inSize << endl;
cout << "outW " << outWidth << endl;
cout << "outH " << outHeight << endl;
cout << "inpW " << inWidth << endl;
cout << "inpH " << inHeight << endl;
cout << "inp scale " << input_scale << endl;
cout << "out scale " << output_scale << endl;
cout << "# classes " << num_of_classes << endl;
cout << "batchSize " << batchSize << endl;
int num_of_trials = 200;
std::chrono::duration<double, std::micro> avg_calibr_highres(0);
for (int i =0; i<num_of_trials; i++)
{
auto t1 = std::chrono::high_resolution_clock::now();
auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> fp_us = t2 - t1;
avg_calibr_highres += fp_us;
}
cout << "[average calibration high resolution clock] " << avg_calibr_highres.count() / num_of_trials << "us" << endl;
cout << "\n" << endl;
num_images_x_thread = num_of_images/num_threads;
num_images_x_thread = (num_images_x_thread/batchSize)*batchSize;
cout << "\n number of dummy images per thread: " << num_images_x_thread << endl;
num_of_images = num_images_x_thread * num_threads;
int8_t * imageInputs = new int8_t [(num_of_images)*inSize];
int8_t * FCResult = new int8_t [(num_of_images)*outSize];
cout << "\n allocated " << num_of_images* inSize << " bytes for input buffer " << endl;
cout << "\n allocated " << num_of_images*outSize << " bytes for output buffer " << endl;
int8_t *imagesInput0 = imageInputs+ inSize*(num_threads==1 ? 0*num_images_x_thread : 0);
int8_t *imagesInput1 = imageInputs+ inSize*(num_threads==2 ? 1*num_images_x_thread : 0);
int8_t *imagesInput2 = imageInputs+ inSize*(num_threads==3 ? 2*num_images_x_thread : 0);
int8_t *imagesInput3 = imageInputs+ inSize*(num_threads==4 ? 3*num_images_x_thread : 0);
int8_t *imagesInput4 = imageInputs+ inSize*(num_threads==5 ? 4*num_images_x_thread : 0);
int8_t *imagesInput5 = imageInputs+ inSize*(num_threads==6 ? 5*num_images_x_thread : 0);
int8_t *FCResult0 = FCResult+ outSize*(num_threads==1 ? 0*num_images_x_thread : 0);
int8_t *FCResult1 = FCResult+ outSize*(num_threads==2 ? 1*num_images_x_thread : 0);
int8_t *FCResult2 = FCResult+ outSize*(num_threads==3 ? 2*num_images_x_thread : 0);
int8_t *FCResult3 = FCResult+ outSize*(num_threads==4 ? 3*num_images_x_thread : 0);
int8_t *FCResult4 = FCResult+ outSize*(num_threads==5 ? 4*num_images_x_thread : 0);
int8_t *FCResult5 = FCResult+ outSize*(num_threads==6 ? 5*num_images_x_thread : 0);
thread workers[num_threads];
auto dpu_t1 = std::chrono::high_resolution_clock::now();
for (auto i = 0; i < num_threads; i++)
{
if (i == 0) workers[i] = thread(runDPU, runner.get(), ref(imagesInput0), ref(FCResult0) );
if (i == 1) workers[i] = thread(runDPU, runner1.get(), ref(imagesInput1), ref(FCResult1) );
if (i == 2) workers[i] = thread(runDPU, runner2.get(), ref(imagesInput2), ref(FCResult2) );
if (i == 3) workers[i] = thread(runDPU, runner3.get(), ref(imagesInput3), ref(FCResult3) );
if (i == 4) workers[i] = thread(runDPU, runner4.get(), ref(imagesInput4), ref(FCResult4) );
if (i == 5) workers[i] = thread(runDPU, runner5.get(), ref(imagesInput5), ref(FCResult5) );
}
for (auto &w : workers) {
if (w.joinable()) w.join();
}
auto dpu_t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> dpu_time = dpu_t2 - dpu_t1 - avg_calibr_highres;
cout << "\n" << endl;
double dpu_tot_time = (double) dpu_time.count();
double dpu_avg_time = (dpu_tot_time*1000000.0)/num_of_images;
double dpu_avg_fps = (num_of_images*1000000.0)/dpu_tot_time;
cout << "[DPU tot Time ] " << dpu_tot_time << "us" << endl;
cout << "[DPU avg Time ] " << dpu_avg_time << "us" << endl;
cout << "[DPU avg FPS ] " << dpu_avg_fps << endl;
cout << "\n" << endl;
cout << "deleting memory buffer" << endl;
delete[] imageInputs;
delete[] FCResult;
return 0;
}
+ ./get_dpu_fps ./vek280_train1_resnet18_cifar10.xmodel 2 10000
+ tee ./rpt/log2.txt
./get_dpu_fps ./vek280_train1_resnet18_cifar10.xmodel 2 10000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:42:26.378609 1926763 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 10
inSize 3072
outW 1
outH 1
inpW 32
inpH 32
inp scale 64
out scale 0.25
# classes 10
batchSize 14
[average calibration high resolution clock] 0.063us
number of dummy images per thread: 4998
allocated 30707712 bytes for input buffer
allocated 99960 bytes for output buffer
[DPU tot Time ] 590140us
[DPU avg Time ] 5.90376e+07us
[DPU avg FPS ] 16938.4
deleting memory buffer
+ tee ./rpt/log3.txt
+ ./get_dpu_fps ./vek280_train1_resnet18_cifar10.xmodel 3 10000
./get_dpu_fps ./vek280_train1_resnet18_cifar10.xmodel 3 10000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:42:27.949460 1926847 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 10
inSize 3072
outW 1
outH 1
inpW 32
inpH 32
inp scale 64
out scale 0.25
# classes 10
batchSize 14
[average calibration high resolution clock] 0.06015us
number of dummy images per thread: 3332
allocated 30707712 bytes for input buffer
allocated 99960 bytes for output buffer
[DPU tot Time ] 538336us
[DPU avg Time ] 5.38551e+07us
[DPU avg FPS ] 18568.3
deleting memory buffer
+ cat ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ rm -f ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ echo ' '
PWD4 = /home/root/target_vek280/cifar10
end of cifar10
PWD5 = /home/root/target_vek280
clean imagenet
compile imagenet
PWD1 = /home/root/target_vek280/imagenet/code_resnet50
++ dirname ./build_resnet50.sh
+ cd .
+ CXX=g++
++ sed 's/^.*:\s*//'
++ lsb_release -a
++ grep 'Distributor ID'
+ os=petalinux
++ sed 's/^.*:\s*//'
++ lsb_release -a
++ grep Release
+ os_version=2023.1+release-S05010539
++ uname -p
+ arch=unknown
+ target_info=petalinux.2023.1+release-S05010539.unknown
+ install_prefix_default=/home/root/.local/petalinux.2023.1+release-S05010539.unknown
+ g++ --version
g++ (GCC) 12.2.0
Copyright (C) 2022 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ result=0
+ grep opencv4
+ pkg-config --list-all
opencv4 OpenCV - Open Source Computer Vision Library
+ result=1
+ '[' 1 -eq 1 ']'
++ pkg-config --cflags --libs-only-L opencv4
+ OPENCV_FLAGS=-I/usr/include/opencv4
++ basename /home/root/target_vek280/imagenet/code_resnet50
+ name=code_resnet50
+ [[ g++ == *\s\y\s\r\o\o\t* ]]
+ g++ -O2 -fno-inline -I. -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/include -I/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/include -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -L/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Debug/lib -Wl,-rpath=/home/root/.local/petalinux.2023.1+release-S05010539.unknown.Release/lib -I/home/root/target_vek280/imagenet/code_resnet50/../../common -o code_resnet50 -std=c++17 /home/root/target_vek280/imagenet/code_resnet50/src/main_resnet50.cc /home/root/target_vek280/imagenet/code_resnet50/../../common/common.cpp -Wl,-rpath=/home/root/target_vek280/imagenet/code_resnet50/lib -lvart-runner -I/usr/include/opencv4 -lopencv_videoio -lopencv_imgcodecs -lopencv_highgui -lopencv_imgproc -lopencv_core -lglog -lxir -lunilog -lpthread
PWD2 = /home/root/target_vek280
build imagenet test images
PWD3 = /home/root/target_vek280
run resnet50 CNN
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:02.777024 1927131 main_resnet50.cc:342] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
OUT size 1000
IN size 150528
IN Height 224
IN Width 224
batchSize 14
Image : ILSVRC2012_val_00049764.JPEG out_result = 18 out index = 409
top[0] prob = 0.849117 name = analog clock
top[1] prob = 0.147554 name = wall clock
top[2] prob = 0.001277 name = bell cote, bell cot
top[3] prob = 0.000774 name = barometer
top[4] prob = 0.000470 name = restaurant, eating house, eating place, eatery
Image : ILSVRC2012_val_00049670.JPEG out_result = 21.75 out index = 476
top[0] prob = 0.999958 name = carousel, carrousel, merry-go-round, roundabout, whirligig
top[1] prob = 0.000017 name = shoe shop, shoe-shop, shoe store
top[2] prob = 0.000005 name = wig
top[3] prob = 0.000002 name = lampshade, lamp shade
top[4] prob = 0.000002 name = drum, membranophone, tympan
…
Image : ILSVRC2012_val_00049798.JPEG out_result = 15.25 out index = 794
top[0] prob = 0.873021 name = shower curtain
top[1] prob = 0.033851 name = binder, ring-binder
top[2] prob = 0.015990 name = toilet seat
top[3] prob = 0.007553 name = envelope
top[4] prob = 0.005882 name = menu
Image : ILSVRC2012_val_00049954.JPEG out_result = 12.5 out index = 289
top[0] prob = 0.548879 name = snow leopard, ounce, Panthera uncia
top[1] prob = 0.157256 name = puffer, pufferfish, blowfish, globefish
top[2] prob = 0.095381 name = leopard, Panthera pardus
top[3] prob = 0.027327 name = hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
top[4] prob = 0.021282 name = African crocodile, Nile crocodile, Crocodylus niloticus
+ TARGET=vek280
+ echo ' '
+ echo ' IMAGENET RESNET50 TOP1 ACCURACY ON DPU'
IMAGENET RESNET50 TOP1 ACCURACY ON DPU
+ echo ' '
+ tee resnet50_result_predictions.log
+ python3 ./code_resnet50/src/check_runtime_top1_imagenet.py -i ./rpt/predictions_resnet50_imagenet.log
./rpt/predictions_resnet50_imagenet.log has 3510 lines
number of total images predicted 499
number of top1 false predictions 151
number of top1 right predictions 348
top1 accuracy = 0.70
+ echo ' '
+ echo ' '
+ echo ' IMAGENET RESNET18 TOP1 ACCURACY ON DPU'
IMAGENET RESNET18 TOP1 ACCURACY ON DPU
+ echo ' '
+ python3 ./code_resnet50/src/check_runtime_top1_imagenet.py -i ./rpt/predictions_resnet18_imagenet.log
+ tee resnet18_result_predictions.log
cannot open ./rpt/predictions_resnet18_imagenet.log
Traceback (most recent call last):
File "/home/root/target_vek280/imagenet/./code_resnet50/src/check_runtime_top1_imagenet.py", line 61, in <module>
for ln in range(0, tot_lines):
NameError: name 'tot_lines' is not defined
+ echo ' '
+ echo ' '
+ echo ' IMAGENET RESNET18 PERFORMANCE (fps)'
IMAGENET RESNET18 PERFORMANCE (fps)
+ echo ' '
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 1 1000
+ tee ./rpt/log1.txt
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 1 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:12.314563 1927193 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0809us
number of dummy images per thread: 994
allocated 149624832 bytes for input buffer
allocated 994000 bytes for output buffer
[DPU tot Time ] 240645us
[DPU avg Time ] 2.42098e+08us
[DPU avg FPS ] 4130.56
deleting memory buffer
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 2 1000
+ tee ./rpt/log2.txt
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 2 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:13.393136 1927203 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0808us
number of dummy images per thread: 490
allocated 147517440 bytes for input buffer
allocated 980000 bytes for output buffer
[DPU tot Time ] 133378us
[DPU avg Time ] 1.361e+08us
[DPU avg FPS ] 7347.52
deleting memory buffer
+ tee ./rpt/log3.txt
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 3 1000
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 3 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:14.402565 1927214 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.081us
number of dummy images per thread: 322
allocated 145410048 bytes for input buffer
allocated 966000 bytes for output buffer
[DPU tot Time ] 131436us
[DPU avg Time ] 1.36062e+08us
[DPU avg FPS ] 7349.57
deleting memory buffer
+ cat ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ rm -f ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ echo ' '
+ echo ' IMAGENET RESNET50 PERFORMANCE (fps)'
IMAGENET RESNET50 PERFORMANCE (fps)
+ echo ' '
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 1 1000
+ tee ./rpt/log1.txt
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 1 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:15.529371 1927228 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0807us
number of dummy images per thread: 994
allocated 149624832 bytes for input buffer
allocated 994000 bytes for output buffer
[DPU tot Time ] 314508us
[DPU avg Time ] 3.16406e+08us
[DPU avg FPS ] 3160.5
deleting memory buffer
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 2 1000
+ tee ./rpt/log2.txt
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 2 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:17.143707 1927244 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.06445us
number of dummy images per thread: 490
allocated 147517440 bytes for input buffer
allocated 980000 bytes for output buffer
[DPU tot Time ] 207249us
[DPU avg Time ] 2.11478e+08us
[DPU avg FPS ] 4728.62
deleting memory buffer
+ tee ./rpt/log3.txt
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 3 1000
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 3 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:18.662076 1927255 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.08095us
number of dummy images per thread: 322
allocated 145410048 bytes for input buffer
allocated 966000 bytes for output buffer
[DPU tot Time ] 204447us
[DPU avg Time ] 2.11643e+08us
[DPU avg FPS ] 4724.94
deleting memory buffer
+ cat ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ rm -f ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ echo ' '
PWD4 = /home/root/target_vek280/imagenet
run resnet18 CNN
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:20.056537 1927275 main_resnet50.cc:342] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
OUT size 1000
IN size 150528
IN Height 224
IN Width 224
batchSize 14
Image : ILSVRC2012_val_00049764.JPEG out_result = 14.5 out index = 409
top[0] prob = 0.753228 name = analog clock
top[1] prob = 0.215803 name = wall clock
top[2] prob = 0.010744 name = bell cote, bell cot
top[3] prob = 0.008368 name = barometer
top[4] prob = 0.003953 name = stopwatch, stop watch
Image : ILSVRC2012_val_00049670.JPEG out_result = 7.75 out index = 476
top[0] prob = 0.271265 name = carousel, carrousel, merry-go-round, roundabout, whirligig
top[1] prob = 0.060527 name = shoe shop, shoe-shop, shoe store
top[2] prob = 0.028591 name = skunk, polecat, wood pussy
top[3] prob = 0.028591 name = night snake, Hypsiglena torquata
top[4] prob = 0.022267 name = mosquito net
Image : ILSVRC2012_val_00049821.JPEG out_result = 19.25 out index = 547
top[0] prob = 0.891029 name = electric locomotive
top[1] prob = 0.044362 name = passenger car, coach, carriage
top[2] prob = 0.034549 name = bullet train, bullet
top[3] prob = 0.026907 name = streetcar, tram, tramcar, trolley, trolley car
top[4] prob = 0.001720 name = freight car
…
Image : ILSVRC2012_val_00049798.JPEG out_result = 11.5 out index = 549
top[0] prob = 0.310085 name = envelope
top[1] prob = 0.188076 name = binder, ring-binder
top[2] prob = 0.146474 name = carton
top[3] prob = 0.032683 name = wallet, billfold, notecase, pocketbook
top[4] prob = 0.019823 name = purse
Image : ILSVRC2012_val_00049954.JPEG out_result = 12 out index = 973
top[0] prob = 0.459629 name = coral reef
top[1] prob = 0.102557 name = platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
top[2] prob = 0.079872 name = coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
top[3] prob = 0.062204 name = stingray
top[4] prob = 0.048445 name = eel
+ TARGET=vek280
+ echo ' '
+ echo ' IMAGENET RESNET50 TOP1 ACCURACY ON DPU'
IMAGENET RESNET50 TOP1 ACCURACY ON DPU
+ echo ' '
+ python3 ./code_resnet50/src/check_runtime_top1_imagenet.py -i ./rpt/predictions_resnet50_imagenet.log
+ tee resnet50_result_predictions.log
./rpt/predictions_resnet50_imagenet.log has 3510 lines
number of total images predicted 499
number of top1 false predictions 151
number of top1 right predictions 348
top1 accuracy = 0.70
+ echo ' '
+ echo ' '
+ echo ' IMAGENET RESNET18 TOP1 ACCURACY ON DPU'
IMAGENET RESNET18 TOP1 ACCURACY ON DPU
+ echo ' '
+ python3 ./code_resnet50/src/check_runtime_top1_imagenet.py -i ./rpt/predictions_resnet18_imagenet.log
+ tee resnet18_result_predictions.log
./rpt/predictions_resnet18_imagenet.log has 3510 lines
number of total images predicted 499
number of top1 false predictions 203
number of top1 right predictions 296
top1 accuracy = 0.59
+ echo ' '
+ echo ' '
+ echo ' IMAGENET RESNET18 PERFORMANCE (fps)'
IMAGENET RESNET18 PERFORMANCE (fps)
+ echo ' '
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 1 1000
+ tee ./rpt/log1.txt
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 1 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:28.414808 1927331 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0807us
number of dummy images per thread: 994
allocated 149624832 bytes for input buffer
allocated 994000 bytes for output buffer
[DPU tot Time ] 240883us
[DPU avg Time ] 2.42337e+08us
[DPU avg FPS ] 4126.49
deleting memory buffer
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 2 1000
+ tee ./rpt/log2.txt
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 2 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:29.512655 1927341 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0603us
number of dummy images per thread: 490
allocated 147517440 bytes for input buffer
allocated 980000 bytes for output buffer
[DPU tot Time ] 133163us
[DPU avg Time ] 1.35881e+08us
[DPU avg FPS ] 7359.41
deleting memory buffer
+ ./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 3 1000
+ tee ./rpt/log3.txt
./get_dpu_fps ./vek280_resnet18_imagenet.xmodel 3 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:30.520401 1927352 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_add
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.25
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.06305us
number of dummy images per thread: 322
allocated 145410048 bytes for input buffer
allocated 966000 bytes for output buffer
[DPU tot Time ] 132584us
[DPU avg Time ] 1.37251e+08us
[DPU avg FPS ] 7285.94
deleting memory buffer
+ cat ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ rm -f ./rpt/log1.txt ./rpt/log2.txt ./rpt/log3.txt
+ echo ' '
+ echo ' IMAGENET RESNET50 PERFORMANCE (fps)'
IMAGENET RESNET50 PERFORMANCE (fps)
+ echo ' '
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 1 1000
+ tee ./rpt/log1.txt
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 1 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:31.591747 1927366 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0766us
number of dummy images per thread: 994
allocated 149624832 bytes for input buffer
allocated 994000 bytes for output buffer
[DPU tot Time ] 315518us
[DPU avg Time ] 3.17423e+08us
[DPU avg FPS ] 3150.37
deleting memory buffer
+ tee ./rpt/log2.txt
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 2 1000
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 2 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:33.182615 1927382 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0807us
number of dummy images per thread: 490
allocated 147517440 bytes for input buffer
allocated 980000 bytes for output buffer
[DPU tot Time ] 207362us
[DPU avg Time ] 2.11594e+08us
[DPU avg FPS ] 4726.03
deleting memory buffer
+ ./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 3 1000
+ tee ./rpt/log3.txt
./get_dpu_fps ./vek280_resnet50_imagenet.xmodel 3 1000
WARNING: Logging before InitGoogleLogging() is written to STDERR
I20231011 14:43:34.699926 1927393 get_dpu_fps.cc:107] create running for subgraph: subgraph_quant_avg_pool_fix(TransferPoolFixToDwConv2dFix)
XAIEFAL: INFO: Resource group Avail is created.
XAIEFAL: INFO: Resource group Static is created.
XAIEFAL: INFO: Resource group Generic is created.
outSize 1000
inSize 150528
outW 1
outH 1
inpW 224
inpH 224
inp scale 0.5
out scale 0.25
# classes 1000
batchSize 14
[average calibration high resolution clock] 0.0815us
number of dummy images per thread: 322
allocated 145410048 bytes for input buffer
allocated 966000 bytes for output buffer
[DPU tot Time ] 204596us
[DPU avg Time ] 2.11797e+08us
[DPU avg FPS ] 4721.49
deleting memory buffer
PWD4 = /home/root/target_vek280/imagenet
target/cifar10/run_all_cifar10_target.sh/end_cif10()
# Final step of the cifar10 flow: print the closing banner, delete the
# cifar10/test directory, and report the directory the shell ends up in.
#
# Must be called from the directory that contains cifar10/ (the cd below is
# relative).
#
# Outputs: banner lines and "PWD5 = <dir>" on stdout; a diagnostic on stderr
#          when the cifar10 directory cannot be entered.
# Returns: 0 on success; 1 when a directory change fails, in which case
#          nothing is deleted and the "PWD5" line is not printed.
end_cif10(){
  echo " "
  echo "end of cifar10"
  echo " "
  # Guard the cd: if it failed and we carried on, "rm -rf test" would delete a
  # "test" directory in whatever directory the caller happened to be in, and
  # the following "cd ../" would move the caller's shell one level too high.
  if ! cd cifar10; then
    echo "end_cif10: cannot enter 'cifar10' from ${PWD}; nothing removed" >&2
    return 1
  fi
  rm -rf test
  cd ../ || return 1
  # Quote $PWD so a path with spaces or glob characters is printed verbatim.
  echo "PWD5 = " "$PWD"
}
end of imagenet
PWD5 = /home/root/target_vek280
root@xilinx-vek280-es1-20231:~/target_vek280#