中文点选验证码——语序

前面一直在研究文字点选验证码的文字识别。在文字识别之后，对于有语序要求的验证码，还需要我们自己按照正确的语序去点击才能通过，例如：

外圆内方

积少成多

得道者多助

利用结巴分词技术可以解决此问题，但是仅限于词语效果较好，对于古诗或者谚语等效果比较差；其中还借助了结巴分词的 dict.txt，其中保存了常见词的词频和词性，在使用的时候可以根据自己的需求进行调整。

import jieba
import os
from itertools import permutations


# Build every arrangement of the given characters
def permutation(text, r=None):
    """Return all orderings of the elements of ``text`` as joined strings.

    Args:
        text: Iterable of strings, typically a ``str`` of characters.
        r: Length of each arrangement; ``None`` uses every element.

    Returns:
        A list with one string per arrangement, in the order produced by
        ``itertools.permutations``. Repeated characters in ``text`` yield
        repeated strings in the result.
    """
    return [''.join(arrangement) for arrangement in permutations(text, r)]


# Locate the longest string in a list (by index)
def find_longest(seg_list):
    """Return the index of the longest string in ``seg_list``.

    When several entries share the maximum length, the index of the first
    one is returned. An empty ``seg_list`` makes ``max`` raise ValueError.
    """
    positions = range(len(seg_list))
    return max(positions, key=lambda pos: len(seg_list[pos]))


# Load a whitespace-separated data file into a dict
def file2dict(filename):
    """Read ``filename`` and map each line's first field to its second.

    Every non-blank line is split on whitespace; the first field becomes the
    key and the second field, converted with ``int``, becomes the value. Any
    further fields on the line are ignored. If a key appears on several
    lines, the last occurrence wins.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        dict mapping first field (str) to second field (int).

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: the second field is not a valid integer.
    """
    return_dict = {}
    with open(filename, encoding='utf-8') as f:
        # Iterate lazily instead of loading the whole file with readlines().
        for line in f:
            fields = line.split()
            # Blank / whitespace-only lines (e.g. a trailing newline) carry
            # no entry; skipping them avoids an IndexError on fields[0].
            if not fields:
                continue
            return_dict[fields[0]] = int(fields[1])
    return return_dict


# Order a dict's entries by key
def sorted_dict_values(possible_dict):
    """Return the ``(key, value)`` pairs of ``possible_dict`` sorted by key.

    Despite the name, the result contains full pairs rather than only the
    values; pairs appear in ascending key order.
    """
    pairs = list(possible_dict.items())
    pairs.sort(key=lambda pair: pair[0])
    return pairs


# Given candidate words, return the one with the highest dictionary frequency
def highest_frequency(possible_words, word_dict=None):
    """Pick the candidate with the highest frequency in the word dictionary.

    Args:
        possible_words: Iterable of candidate strings.
        word_dict: Optional mapping of word -> frequency. When ``None``, the
            mapping is loaded from ``dict.txt`` in the current working
            directory via ``file2dict``. Passing a preloaded mapping avoids
            re-reading the file on every call.

    Returns:
        The candidate with the largest frequency. Candidates missing from
        the dictionary count as frequency 0 instead of raising KeyError.
        When several candidates share the top frequency, the one appearing
        last in ``possible_words`` is returned.

    Raises:
        ValueError: ``possible_words`` is empty.
    """
    candidates = list(possible_words)
    if not candidates:
        raise ValueError("possible_words must not be empty")
    if word_dict is None:
        # dict.txt is the dictionary file recording each word's count and
        # part of speech.
        word_dict = file2dict('dict.txt')

    best_word = candidates[0]
    best_frequency = word_dict.get(best_word, 0)
    for word in candidates[1:]:
        frequency = word_dict.get(word, 0)
        # ">=" lets a later candidate win a tie.
        if frequency >= best_frequency:
            best_word, best_frequency = word, frequency
    return best_word


# Recover the word order of shuffled characters with jieba segmentation
def recognize_order_jieba(text):
    """Find the arrangement of ``text`` that jieba recognises as one word.

    Every distinct arrangement of the characters is segmented with jieba in
    full mode (``cut_all=True``). An arrangement is a candidate when its
    longest segment is as long as ``text`` itself. A single candidate is
    returned directly; several candidates are resolved with
    ``highest_frequency``.

    Note that the number of arrangements grows factorially with
    ``len(text)``.

    Args:
        text: The characters to reorder.

    Returns:
        The recognised string, or ``0`` when ``text`` is empty or no
        arrangement is recognised.
    """
    length = len(text)
    if length == 0:
        # Nothing to arrange; the empty arrangement would segment to an
        # empty list and crash find_longest.
        return 0

    possible_words = []  # arrangements whose word order may be correct
    # dict.fromkeys drops duplicate arrangements (caused by repeated
    # characters) while keeping their first-seen order, so an unambiguous
    # answer is not pushed into the frequency tie-break.
    for word in dict.fromkeys(permutation(text)):
        seg_list = jieba.lcut(word, cut_all=True)
        if not seg_list:
            continue
        longest = seg_list[find_longest(seg_list)]
        # A segment as long as the input means this arrangement is a
        # candidate for the correct order.
        if len(longest) == length:
            possible_words.append(longest)

    if not possible_words:
        return 0
    if len(possible_words) == 1:
        return possible_words[0]
    # Several candidates: prefer the one with the highest word frequency.
    return highest_frequency(possible_words)


if __name__ == "__main__":
    # Demo run: feed shuffled characters and print either the restored
    # order or the fallback message when nothing was recognised.
    scrambled = '铁磨成杵针'
    restored = recognize_order_jieba(scrambled)
    print(restored if restored else '未识别出结果')

测试结果：

方外内圆 –> 外圆内方

成少积多 –> 积少成多

者多道得助 –> 未识别出结果

铁磨成杵针 –> 铁杵磨成针