123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566 |
- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import random
- from utils.logging import get_logger
- class FileCorpus(object):
- def __init__(self, config):
- self.logger = get_logger()
- self.logger.info("using FileCorpus")
- self.char_list = " 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
- corpus_file = config["CorpusGenerator"]["corpus_file"]
- self.language = config["CorpusGenerator"]["language"]
- with open(corpus_file, 'r') as f:
- corpus_raw = f.read()
- self.corpus_list = corpus_raw.split("\n")[:-1]
- assert len(self.corpus_list) > 0
- random.shuffle(self.corpus_list)
- self.index = 0
- def generate(self, corpus_length=0):
- if self.index >= len(self.corpus_list):
- self.index = 0
- random.shuffle(self.corpus_list)
- corpus = self.corpus_list[self.index]
- if corpus_length != 0:
- corpus = corpus[0:corpus_length]
- if corpus_length > len(corpus):
- self.logger.warning("generated corpus is shorter than expected.")
- self.index += 1
- return self.language, corpus
- class EnNumCorpus(object):
- def __init__(self, config):
- self.logger = get_logger()
- self.logger.info("using NumberCorpus")
- self.num_list = "0123456789"
- self.en_char_list = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
- self.height = config["Global"]["image_height"]
- self.max_width = config["Global"]["image_width"]
- def generate(self, corpus_length=0):
- corpus = ""
- if corpus_length == 0:
- corpus_length = random.randint(5, 15)
- for i in range(corpus_length):
- if random.random() < 0.2:
- corpus += "{}".format(random.choice(self.en_char_list))
- else:
- corpus += "{}".format(random.choice(self.num_list))
- return "en", corpus
|