OpenVINO场景文字检测与识别

领域专家: Java技术领域

2020-10-16 12:29:27

OpenVINO提供的场景文字检测模型准确率非常高，完全可以达到实用级别。其实OpenVINO还提供了另外一个场景文字识别的模型，总体使用下来的感觉是没有场景文字检测那么靠谱，而且只支持英文字母与数字识别，不支持中文，不得不说是一个小小的遗憾。但是对比较干净的文档图像，它的识别准确率还是相当高的，速度也比较快，基本上都在毫秒级别出结果。

模型介绍
文本识别(OCR)模型采用的网络架构为基础网络+双向LSTM，其中基础网络选择的是VGG16，字母识别是非大小写敏感的，26个字母+10个数字总计36个字符。其网络结构类似如下：

模型输入结构为：

[BxCxHxW]=1x1x32x120

其中B表示批次、C表示通道、H表示高度、W表示宽度
模型输出结果为：

[WxBxL] = 30x1x37

其中B表示批次、W表示输出序列长度、L表示37个字符各自的得分，其中第37个字符是#（解码时作为空白分隔符处理）
输出部分的解析基于CTC贪心解码方式。
代码实现

加载模型

# Load the IR (intermediate representation) files of both networks:
# `net` is built from model_xml/model_bin and `text_net` from text_xml/text_bin.

log.info("Reading IR...")

net = IENetwork(model=model_xml, weights=model_bin)

text_net = IENetwork(model=text_xml, weights=text_bin)

场景文字检测

# Read the input image (alternative sample: "D:/images/openvino_ocr.png").
# cv2.imread returns None instead of raising when the file cannot be read,
# so fail early with a clear message rather than crashing inside imshow/resize.
image = cv2.imread("D:/images/cover_01.jpg")
if image is None:
    log.error("Cannot read image: D:/images/cover_01.jpg")
    sys.exit(1)
cv2.imshow("image", image)

# Start timing, resize to the network input size and run the detection network.
inf_start = time.time()
in_frame = cv2.resize(image, (w, h))
in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
in_frame = in_frame.reshape((n, c, h, w))
exec_net.infer(inputs={input_blob: in_frame})

ROI截取与文字识别

# Crop the detected text region with a small margin. The slice starts are
# clamped at 0: a negative start would be interpreted as "counted from the
# end" and produce an empty or wrong crop for boxes near the top/left border.
x, y, width, height = cv2.boundingRect(contours[c])
roi = image[max(y - 5, 0):y + height + 10, max(x - 5, 0):x + width + 10, :]

# Convert to a single-channel image and bring it into the (tn, tc, th, tw)
# layout expected by the recognition network.
gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
text_roi = cv2.resize(gray, (tw, th))
text_roi = np.expand_dims(text_roi, 2)
text_roi = text_roi.transpose((2, 0, 1))
text_roi = text_roi.reshape((tn, tc, th, tw))

# Feed the recognition network through its own input name (text_input_blob),
# not the detection network's input_blob.
text_exec_net.infer(inputs={text_input_blob: text_roi})
text_out = text_exec_net.requests[0].outputs[text_out_blob]

CTC解析结果

# Decode the recognition output into text (CTC greedy decoding):
# take the best symbol of every step, drop '#' and collapse repeated symbols
# unless a '#' was seen between them.
ocrstr = ""
prev_pad = False
for step_scores in text_out:
    ctc = np.squeeze(step_scores, 0)
    index, prob = ctc_soft_max(ctc)
    symbol = alphabet[index]
    if symbol == '#':
        prev_pad = True
        continue
    if not ocrstr or prev_pad or symbol != ocrstr[-1]:
        prev_pad = False
        ocrstr += symbol

输出文字检测与识别结果

# Show the recognition result: print the decoded string, then draw the
# contour `box` and the text `ocrstr` at position (x, y) onto `image`.

print("result: %s"%ocrstr)

cv2.drawContours(image, [box], 0, (0, 255, 0), 2)

cv2.putText(image, ocrstr, (x, y), cv2.FONT_HERSHEY_COMPLEX, 0.75, (255, 0, 0), 1)

最后送上整个演示代码

def _ctc_greedy_decode(text_out):
    """Turn the recognition network output into a string (CTC greedy decoding).

    For every step along the first axis of ``text_out`` the best symbol is
    picked via ``ctc_soft_max``. The '#' symbol is dropped, and a symbol equal
    to the previously emitted one is only emitted again if a '#' was seen in
    between.
    """
    chars = []
    prev_pad = False
    for step_scores in text_out:
        index, _prob = ctc_soft_max(np.squeeze(step_scores, 0))
        symbol = alphabet[index]
        if symbol == '#':
            prev_pad = True
            continue
        if not chars or prev_pad or symbol != chars[-1]:
            prev_pad = False
            chars.append(symbol)
    return "".join(chars)


def demo(image_path="D:/images/openvino_ocr.png"):
    """Detect text regions in one image and recognize the text in each region.

    The function loads the detection and recognition networks on the CPU
    plugin, runs the detection network once, thresholds its first output into
    a text mask, and runs the recognition network on every contour of that
    mask. Results are printed, drawn on a copy of the image, shown in OpenCV
    windows, and written to "D:/mask.png" and "D:/result.png".

    It relies on module-level names defined elsewhere in the file:
    ``plugin_dir``, ``cpu_extension``, ``model_xml``, ``model_bin``,
    ``text_xml``, ``text_bin``, ``alphabet``, ``soft_max`` and
    ``ctc_soft_max``.

    Args:
        image_path: Path of the image to process. Defaults to the sample
            image used by the original demo.

    Exits the process with status 1 if the detection network contains layers
    the CPU plugin does not support, or if the image cannot be read.
    """
    # Load the MKLDNN plugin - CPU target
    log.basicConfig(format="[ %(levelname)s ] %(message)s", level=log.INFO, stream=sys.stdout)
    plugin = IEPlugin(device="CPU", plugin_dirs=plugin_dir)
    plugin.add_cpu_extension(cpu_extension)

    # Load the IR of both networks
    log.info("Reading IR...")
    net = IENetwork(model=model_xml, weights=model_bin)
    text_net = IENetwork(model=text_xml, weights=text_bin)

    if plugin.device == "CPU":
        supported_layers = plugin.get_supported_layers(net)
        not_supported_layers = [l for l in net.layers.keys() if l not in supported_layers]
        if not_supported_layers:
            log.error("Following layers are not supported by the plugin for specified device {}:\n {}".
                      format(plugin.device, ', '.join(not_supported_layers)))
            log.error("Please try to specify cpu extensions library path in demo's command line parameters using -l "
                      "or --cpu_extension command line argument")
            sys.exit(1)

    # Input name and the two output names of the detection network
    input_blob = next(iter(net.inputs))
    outputs = iter(net.outputs)
    out_blob = next(outputs)
    second_blob = next(outputs)
    log.info("Loading IR to the plugin...")
    print("pixel output: %s, link output: %s \n"%(out_blob, second_blob))

    # Input and output names of the recognition network
    text_input_blob = next(iter(text_net.inputs))
    text_out_blob = next(iter(text_net.outputs))
    print("text_out_blob : %s"%text_out_blob)

    # Create the executable networks
    exec_net = plugin.load(network=net)
    text_exec_net = plugin.load(network=text_net)

    # Remember the input shapes, then release the network descriptions
    n, c, h, w = net.inputs[input_blob].shape
    tn, tc, th, tw = text_net.inputs[text_input_blob].shape
    del net
    del text_net

    # NOTE(review): these messages mention async mode and key handling, but
    # this function only performs blocking infer() calls and a final waitKey.
    log.info("Starting inference in async mode...")
    log.info("To switch between sync and async modes press Tab button")
    log.info("To stop the demo execution press Esc button")

    # cv2.imread returns None instead of raising when the file cannot be read
    image = cv2.imread(image_path)
    if image is None:
        log.error("Cannot read image: %s", image_path)
        sys.exit(1)
    cv2.imshow("image", image)

    # Run the detection network; the measured time covers the resize, the
    # layout change and the inference itself
    inf_start = time.time()
    in_frame = cv2.resize(image, (w, h))
    in_frame = in_frame.transpose((2, 0, 1))  # Change data layout from HWC to CHW
    in_frame = in_frame.reshape((n, c, h, w))
    exec_net.infer(inputs={input_blob: in_frame})
    inf_end = time.time()
    det_time = inf_end - inf_start

    # Fetch both outputs, drop the batch axis and move channels last
    res1 = exec_net.requests[0].outputs[out_blob]
    res2 = exec_net.requests[0].outputs[second_blob]
    res1 = np.squeeze(res1, 0).transpose((1, 2, 0))
    res2 = np.squeeze(res2, 0).transpose((1, 2, 0))
    print(res1.shape)
    print(res2.shape)

    # Text / non-text pixel mask: channel 1 of the soft-maxed first output is
    # thresholded at 0.5 (assumes soft_max returns an array of the same
    # (H, W, C) shape as its input - it is indexed that way by the original).
    res1 = soft_max(res1)
    pixel_mask = np.where(res1[:, :, 1] > 0.50, 255, 0).astype(np.uint8)

    se = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 1))
    mask = cv2.morphologyEx(pixel_mask, cv2.MORPH_CLOSE, se)
    cv2.imshow("text mask", mask)
    cv2.imwrite("D:/mask.png", mask)

    # Post-processing: scale the mask back to the image size and find boxes.
    # Taking [-2] works whether findContours returns two values or three
    # (OpenCV 3.x additionally returns the image first).
    img_h, img_w = image.shape[:2]
    mask = cv2.resize(mask, (img_w, img_h))
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Draw on a copy so that boxes and labels of earlier regions never end up
    # inside the crops that are fed to the recognition network later.
    canvas = image.copy()
    for contour in contours:
        rect = cv2.minAreaRect(contour)
        box = np.intp(cv2.boxPoints(rect))  # np.int0 was removed in NumPy 2.0

        # Crop with a small margin; clamp the start so it never goes negative
        # (a negative slice start would wrap around and give a wrong crop).
        x, y, width, height = cv2.boundingRect(contour)
        roi = image[max(y - 5, 0):y + height + 10, max(x - 5, 0):x + width + 10, :]
        gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
        text_roi = cv2.resize(gray, (tw, th)).reshape((tn, tc, th, tw))

        # Feed the recognition network through its own input name
        text_exec_net.infer(inputs={text_input_blob: text_roi})
        text_out = text_exec_net.requests[0].outputs[text_out_blob]
        ocrstr = _ctc_greedy_decode(text_out)

        # Show the recognition result
        print("result: %s"%ocrstr)
        cv2.drawContours(canvas, [box], 0, (0, 255, 0), 2)
        cv2.putText(canvas, ocrstr, (x, y), cv2.FONT_HERSHEY_COMPLEX, 0.75, (255, 0, 0), 1)

    # ASCII comma only: Hershey fonts cannot render the full-width comma.
    fps = 1.0 / det_time if det_time > 0 else float("inf")
    inf_time_message = "Inference time: {:.3f} ms, FPS:{:.3f}".format(det_time * 1000, fps)
    cv2.putText(canvas, inf_time_message, (15, 15), cv2.FONT_HERSHEY_COMPLEX, 0.5, (255, 255, 0), 1)
    cv2.imshow("result", canvas)
    cv2.imwrite("D:/result.png", canvas)
    cv2.waitKey(0)

    # Release resources
    cv2.destroyAllWindows()
    del exec_net
    del text_exec_net
    del plugin