package opennlp.tools.formats.muc;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import opennlp.tools.formats.muc.SgmlParser;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import org.apache.tika.parser.ner.NERecogniser;

/* loaded from: input_file:WEB-INF/lib/opennlp-tools-1.8.4.jar:opennlp/tools/formats/muc/MucNameContentHandler.class */
/**
 * SGML content handler that converts MUC-style name markup into {@link NameSample}s.
 *
 * <p>Character data seen while inside a content element (one listed in
 * {@code MucElementNames.CONTENT_ELEMENTS}) is tokenized and buffered. {@code ENAMEX},
 * {@code TIMEX} and {@code NUMEX} elements delimit name spans measured in token offsets
 * of that buffer. When the content element closes, the buffered tokens and spans are
 * appended as a single sample to the list handed to the constructor, and the buffers
 * are reset.
 */
public class MucNameContentHandler extends SgmlParser.ContentHandler {
    private static final String ENTITY_ELEMENT_NAME = "ENAMEX";
    private static final String TIME_ELEMENT_NAME = "TIMEX";
    private static final String NUM_ELEMENT_NAME = "NUMEX";

    /** Element names that open and close a name span. */
    private static final Set<String> NAME_ELEMENT_NAMES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(ENTITY_ELEMENT_NAME, TIME_ELEMENT_NAME, NUM_ELEMENT_NAME)));

    /**
     * Values accepted for the {@code TYPE} attribute of a name element. Written as plain
     * literals so the accepted types are visible here and not borrowed from constants of
     * an unrelated library.
     */
    private static final Set<String> EXPECTED_TYPES = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(
                    "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PERCENT")));

    private final Tokenizer tokenizer;

    /** Destination list; one sample is appended per closed content element. */
    private final List<NameSample> storedSamples;

    /** Tokens collected for the content element currently being read. */
    private final List<String> text = new ArrayList<>();

    /** Name elements that have been opened but not yet closed; innermost on top. */
    private final Stack<Span> incompleteNames = new Stack<>();

    /** Completed name spans for the content element currently being read. */
    private final List<Span> names = new ArrayList<>();

    private boolean isInsideContentElement = false;

    /** Set when a {@code DOC} element starts; consumed by the next emitted sample. */
    private boolean isClearAdaptiveData = false;

    /**
     * Creates a handler.
     *
     * @param tokenizer tokenizer applied to character data inside content elements
     * @param list list that receives the produced samples
     */
    public MucNameContentHandler(Tokenizer tokenizer, List<NameSample> list) {
        this.tokenizer = tokenizer;
        this.storedSamples = list;
    }

    /**
     * Handles an opening tag.
     *
     * @param str element name
     * @param map element attributes; the {@code TYPE} attribute is read for name elements
     * @throws InvalidFormatException if a name element carries a {@code TYPE} value
     *         (including a missing one) that is not among the expected types
     */
    @Override
    public void startElement(String str, Map<String, String> map) throws InvalidFormatException {
        if ("DOC".equals(str)) {
            isClearAdaptiveData = true;
        }
        if (MucElementNames.CONTENT_ELEMENTS.contains(str)) {
            isInsideContentElement = true;
        }
        if (NAME_ELEMENT_NAMES.contains(str)) {
            String type = map.get("TYPE");
            if (!EXPECTED_TYPES.contains(type)) {
                throw new InvalidFormatException("Unknown timex, numex or namex type: " + type + ", expected one of " + EXPECTED_TYPES);
            }
            // Only the start offset is known so far; the end is filled in by endElement.
            int start = text.size();
            incompleteNames.push(new Span(start, start, type.toLowerCase(Locale.ENGLISH)));
        }
    }

    /**
     * Tokenizes character data and appends the tokens to the buffer. Data seen outside a
     * content element is ignored.
     *
     * @param charSequence raw character data
     */
    @Override
    public void characters(CharSequence charSequence) {
        if (isInsideContentElement) {
            text.addAll(Arrays.asList(tokenizer.tokenize(charSequence.toString())));
        }
    }

    /**
     * Handles a closing tag. Closing a name element completes the most recently opened
     * span at the current token offset; closing a content element emits a sample and
     * resets the token and span buffers.
     *
     * @param str element name
     * @throws java.util.EmptyStackException if a name element is closed while none is open
     */
    @Override
    public void endElement(String str) {
        if (NAME_ELEMENT_NAMES.contains(str)) {
            Span opened = incompleteNames.pop();
            names.add(new Span(opened.getStart(), text.size(), opened.getType()));
        }
        if (MucElementNames.CONTENT_ELEMENTS.contains(str)) {
            storedSamples.add(new NameSample(
                    text.toArray(new String[0]),
                    names.toArray(new Span[0]),
                    isClearAdaptiveData));
            // Only the first sample after a DOC start carries the clear-adaptive-data flag.
            isClearAdaptiveData = false;
            text.clear();
            names.clear();
            isInsideContentElement = false;
        }
    }
}
