/*
 * Decompiled with CFR 0.152.
 */
package net.sf.okapi.steps.cleanup;

import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.Util;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.steps.cleanup.Parameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Cleaner {
    /** Per-instance logger, named after the runtime class of this object. */
    private final Logger LOGGER = LoggerFactory.getLogger(this.getClass());
    /** Regex: an opening guillemet (U+00AB or U+2039) followed by whitespace and/or no-break spaces. */
    private static final String OPENING_QUOTES_W_SPACE = "([\u00ab\u2039])([\\s\u00a0]+)";
    /** Regex: whitespace and/or no-break spaces followed by a closing guillemet (U+00BB or U+203A). */
    private static final String CLOSING_QUOTES_W_SPACE = "([\\s\u00a0]+)([\u00bb\u203a])";
    /** Regex alternation of the typographic double quotes that get replaced by {@link #DQ_REPLACE}. */
    private static final String DOUBLE_QUOTES = "\u201c|\u201d|\u201e|\u201f|\u00ab|\u00bb";
    /** Replacement for every match of {@link #DOUBLE_QUOTES}: the straight double quote. */
    private static final String DQ_REPLACE = "\"";
    /** Regex alternation of the typographic single quotes that get replaced by {@link #SQ_REPLACE}. */
    private static final String SINGLE_QUOTES = "\u2018|\u2019|\u201a|\u2039|\u203a";
    /** Replacement for every match of {@link #SINGLE_QUOTES}: the straight apostrophe. */
    private static final String SQ_REPLACE = "'";
    /** Characters after which no space is inserted following a comma in source text. */
    private static final String SPECIALPUNC = "\"'";
    /** Straight and typographic single-quote characters (currently unused in this class). */
    private static final String SINGLEQUOTES = "'\u2018\u2019\u201a\u201b\u2039\u203a";
    /** Straight and typographic double-quote characters (currently unused in this class). */
    private static final String DOUBLEQUOTES = "\"\u201c\u201d\u201e\u201f\u00ab\u00bb";
    /** Punctuation characters whose surrounding spacing is handled by normalizePunctuation. */
    private static final String PUNCTUATION = ".,;:!\u00a1?\u00bf";
    /** Opening typographic quote characters (currently unused in this class). */
    private static final String OPENINGQUOTES = "\u2018\u201a\u2039\u201c\u201e\u00ab";
    /** Closing typographic quote characters (currently unused in this class). */
    private static final String CLOSINGQUOTES = "\u2019\u201b\u203a\u201d\u201f\u00bb";
    /** All single quotes, double quotes and punctuation characters listed above (currently unused in this class). */
    private static final String MARKS = "'\u2018\u2019\u201a\u201b\u2039\u203a\"\u201c\u201d\u201e\u201f\u00ab\u00bb.,;:!\u00a1?\u00bf";
    /** All single and double quote characters listed above (currently unused in this class). */
    private static final String QUOTES = "'\u2018\u2019\u201a\u201b\u2039\u203a\"\u201c\u201d\u201e\u201f\u00ab\u00bb";
    /** Step options; never null once a constructor has run. */
    private Parameters params;

    /**
     * Creates a cleaner that uses a freshly constructed {@link Parameters} object.
     */
    public Cleaner() {
        this(new Parameters());
    }

    /**
     * Creates a cleaner with the given options.
     *
     * @param params the options to use; when {@code null}, a new {@link Parameters} instance is
     *               created and used instead
     */
    public Cleaner(Parameters params) {
        if (params != null) {
            this.params = params;
        } else {
            this.params = new Parameters();
        }
    }

    /**
     * Runs the enabled cleaning operations on every source segment of the text unit that has a
     * matching target segment for the given locale, then prunes the text unit.
     *
     * <p>Whitespace normalization is always applied; quote normalization, character checks and
     * regex matching are applied only when the corresponding option in the parameters is set.
     * Source segments without a target segment are skipped.
     *
     * @param tu           the text unit to clean
     * @param targetLocale the locale of the target segments to process
     * @return the value returned by {@link #pruneTextUnit(ITextUnit, LocaleId)}
     */
    public boolean run(ITextUnit tu, LocaleId targetLocale) {
        if (tu.isEmpty()) {
            // Nothing to clean; pruning alone decides the result.
            return pruneTextUnit(tu, targetLocale);
        }
        for (Segment sourceSegment : tu.getSourceSegments()) {
            boolean hasTarget = tu.getTargetSegment(targetLocale, sourceSegment.getId(), false) != null;
            if (!hasTarget) {
                continue;
            }
            normalizeWhitespace(tu, sourceSegment, targetLocale);
            if (params.getNormalizeQuotes()) {
                normalizeQuotation(tu, sourceSegment, targetLocale);
            }
            if (params.getCheckCharacters()) {
                checkCharacters(tu, sourceSegment, targetLocale);
            }
            if (params.getMatchRegexExpressions()) {
                matchRegexExpressions(tu, sourceSegment, targetLocale);
            }
        }
        return pruneTextUnit(tu, targetLocale);
    }

    /**
     * Applies {@code TextFragment.unwrap} to the content of the source segment and then to the
     * content of the target segment that shares its id.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    protected void normalizeWhitespace(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        TextFragment sourceContent = seg.getContent();
        TextFragment.unwrap(sourceContent);

        Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
        TextFragment targetContent = targetSegment.getContent();
        TextFragment.unwrap(targetContent);
    }

    /** Compiled once: opening guillemet followed by spacing (group 1 is the quote). */
    private static final Pattern OPENING_QUOTE_SPACING_PATTERN = Pattern.compile(OPENING_QUOTES_W_SPACE);
    /** Compiled once: spacing followed by a closing guillemet (group 2 is the quote). */
    private static final Pattern CLOSING_QUOTE_SPACING_PATTERN = Pattern.compile(CLOSING_QUOTES_W_SPACE);
    /** Compiled once: any typographic double quote. */
    private static final Pattern DOUBLE_QUOTE_PATTERN = Pattern.compile(DOUBLE_QUOTES);
    /** Compiled once: any typographic single quote. */
    private static final Pattern SINGLE_QUOTE_PATTERN = Pattern.compile(SINGLE_QUOTES);

    /**
     * Replaces typographic quotes with straight quotes in the coded text of the source segment
     * and of its target segment.
     *
     * <p>In the target only, spacing between a guillemet and the quoted text is removed first:
     * whitespace/no-break spaces after an opening guillemet and before a closing guillemet.
     * Then, in both source and target, double quotes become {@code "} and single quotes become
     * {@code '}.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    protected void normalizeQuotation(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        TextFragment trgFragment = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        TextFragment srcFragment = seg.getContent();

        String srcText = straightenQuotes(srcFragment.getCodedText());

        String trgText = trgFragment.getCodedText();
        trgText = OPENING_QUOTE_SPACING_PATTERN.matcher(trgText).replaceAll("$1");
        trgText = CLOSING_QUOTE_SPACING_PATTERN.matcher(trgText).replaceAll("$2");
        trgText = straightenQuotes(trgText);

        srcFragment.setCodedText(srcText);
        trgFragment.setCodedText(trgText);
    }

    /** Replaces typographic double quotes, then typographic single quotes, with straight ones. */
    private static String straightenQuotes(String text) {
        String withStraightDoubles = DOUBLE_QUOTE_PATTERN.matcher(text).replaceAll(DQ_REPLACE);
        return SINGLE_QUOTE_PATTERN.matcher(withStraightDoubles).replaceAll(SQ_REPLACE);
    }

    /**
     * Placeholder with an empty body: calling it currently has no effect.
     *
     * @param tu           unused
     * @param seg          unused
     * @param targetLocale unused
     */
    protected void normalizeMarks(ITextUnit tu, Segment seg, LocaleId targetLocale) {
    }

    /**
     * Adjusts the spacing around punctuation marks in the coded text of both fragments and
     * writes the adjusted text back into each fragment.
     *
     * <p>"Space" below means a character for which {@link Character#isWhitespace(char)} is true
     * or a no-break space (U+00A0), unless stated otherwise. Each rule touches at most one
     * character on either side of the mark:
     * <ul>
     *   <li>{@code .} : a space after it is removed; a space before it is removed when the
     *       period ends the text or is followed by a digit.</li>
     *   <li>{@code ,} in the source: a space before it is removed unless a digit follows; a
     *       whitespace character after it is removed; a single space is then inserted after it
     *       unless a digit, {@code "} or {@code '} follows or the comma ends the text.</li>
     *   <li>{@code ,} in the target: left alone when it ends the text or (unless it is the first
     *       character) a digit follows; otherwise a space before it is removed and a space is
     *       inserted after it when none is there.</li>
     *   <li>{@code ;} and {@code :} : a space before is removed; a space is inserted after when
     *       none is there and the mark does not end the text.</li>
     *   <li>{@code !} and {@code ?} : a space after and a space before are removed.</li>
     *   <li>Inverted exclamation/question marks (U+00A1, U+00BF): a space before is removed.</li>
     * </ul>
     *
     * <p>Marks at the very start or end of the text are handled without reading outside the
     * text, and only space characters are ever deleted.
     *
     * @param srcFrag the source fragment, modified in place
     * @param trgFrag the target fragment, modified in place
     */
    protected void normalizePunctuation(TextFragment srcFrag, TextFragment trgFrag) {
        StringBuilder srcText = new StringBuilder(srcFrag.getCodedText());
        StringBuilder trgText = new StringBuilder(trgFrag.getCodedText());
        normalizePunctuationSpacing(srcText, true);
        normalizePunctuationSpacing(trgText, false);
        srcFrag.setCodedText(srcText.toString());
        trgFrag.setCodedText(trgText.toString());
    }

    /**
     * Walks the text once and applies the spacing rule of each punctuation mark found.
     * Only the comma rule differs between source and target text.
     */
    private static void normalizePunctuationSpacing(StringBuilder text, boolean isSource) {
        for (int cur = 0; cur < text.length(); ++cur) {
            char ch = text.charAt(cur);
            if (PUNCTUATION.indexOf(ch) == -1) {
                continue;
            }
            // Each handler returns the (possibly shifted) index of the mark it just processed.
            switch (ch) {
                case '.':
                    cur = normalizePeriod(text, cur);
                    break;
                case ',':
                    cur = isSource ? normalizeSourceComma(text, cur) : normalizeTargetComma(text, cur);
                    break;
                case ';':
                case ':':
                    cur = normalizeSeparator(text, cur);
                    break;
                case '!':
                case '?':
                    cur = normalizeClosingMark(text, cur);
                    break;
                case '\u00a1':
                case '\u00bf':
                    cur = normalizeOpeningMark(text, cur);
                    break;
                default:
                    break;
            }
        }
    }

    /** Tells whether the character at {@code index} is whitespace or a no-break space. */
    private static boolean isSpaceAt(StringBuilder text, int index) {
        char c = text.charAt(index);
        return Character.isWhitespace(c) || c == '\u00a0';
    }

    /** Period rule: drop a space after; drop a space before when at the end or before a digit. */
    private static int normalizePeriod(StringBuilder text, int cur) {
        if (cur < text.length() - 1 && isSpaceAt(text, cur + 1)) {
            text.deleteCharAt(cur + 1);
        }
        if (cur <= 0 || !isSpaceAt(text, cur - 1)) {
            return cur;
        }
        boolean endsText = cur == text.length() - 1;
        if (endsText || Character.isDigit(text.charAt(cur + 1))) {
            text.deleteCharAt(cur - 1);
            return cur - 1;
        }
        return cur;
    }

    /** Source comma rule: no space before (unless a digit follows), exactly one space after. */
    private static int normalizeSourceComma(StringBuilder text, int cur) {
        if (cur > 0 && cur < text.length() - 1 && isSpaceAt(text, cur - 1)
                && !Character.isDigit(text.charAt(cur + 1))) {
            text.deleteCharAt(cur - 1);
            --cur;
        }
        if (cur >= text.length() - 1) {
            return cur;
        }
        if (Character.isWhitespace(text.charAt(cur + 1))) {
            text.deleteCharAt(cur + 1);
            // The deleted character may have been the last one; nothing follows the comma then.
            if (cur >= text.length() - 1) {
                return cur;
            }
        }
        char next = text.charAt(cur + 1);
        if (!Character.isDigit(next) && SPECIALPUNC.indexOf(next) == -1) {
            text.insert(cur + 1, ' ');
        }
        return cur;
    }

    /** Target comma rule: no space before, a space after, untouched before digits or at the end. */
    private static int normalizeTargetComma(StringBuilder text, int cur) {
        if (cur > 0) {
            if (cur >= text.length() - 1 || Character.isDigit(text.charAt(cur + 1))) {
                return cur;
            }
            // Only a space may be removed; any other preceding character is content.
            if (isSpaceAt(text, cur - 1)) {
                text.deleteCharAt(cur - 1);
                --cur;
            }
        }
        if (cur < text.length() - 1 && !isSpaceAt(text, cur + 1)) {
            text.insert(cur + 1, ' ');
        }
        return cur;
    }

    /** Semicolon/colon rule: no space before, a space after unless the mark ends the text. */
    private static int normalizeSeparator(StringBuilder text, int cur) {
        if (cur > 0 && isSpaceAt(text, cur - 1)) {
            text.deleteCharAt(cur - 1);
            --cur;
        }
        if (cur < text.length() - 1 && !isSpaceAt(text, cur + 1)) {
            text.insert(cur + 1, ' ');
        }
        return cur;
    }

    /** Exclamation/question mark rule: drop one space after and one space before. */
    private static int normalizeClosingMark(StringBuilder text, int cur) {
        if (cur < text.length() - 1 && isSpaceAt(text, cur + 1)) {
            text.deleteCharAt(cur + 1);
        }
        if (cur > 0 && isSpaceAt(text, cur - 1)) {
            text.deleteCharAt(cur - 1);
            return cur - 1;
        }
        return cur;
    }

    /** Inverted exclamation/question mark rule: drop one space before the mark. */
    private static int normalizeOpeningMark(StringBuilder text, int cur) {
        // cur == 0 has no preceding character to inspect.
        if (cur <= 0 || !isSpaceAt(text, cur - 1)) {
            return cur;
        }
        text.deleteCharAt(cur - 1);
        return cur - 1;
    }

    /**
     * Marks a segment for removal by clearing the content of its target segment;
     * {@link #pruneTextUnit(ITextUnit, LocaleId)} later acts on segments whose target is empty.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment whose target is cleared
     * @param targetLocale the locale of the target segment
     */
    protected void markSegmentForRemoval(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
        TextFragment targetContent = targetSegment.getContent();
        targetContent.clear();
    }

    /**
     * Marks the segment for removal when the user-supplied regular expression is found in the
     * source or in the target text.
     *
     * <p>Nothing happens unless user-regex matching is enabled in the parameters and the
     * expression is neither {@code null} nor empty. An empty expression is skipped on purpose:
     * it would match every text and therefore remove every segment. A syntactically invalid
     * expression is logged as an error and otherwise ignored.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    protected void matchRegexExpressions(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        StringBuilder srcText = new StringBuilder(seg.text);
        StringBuilder trgText = new StringBuilder(tu.getTargetSegment(targetLocale, seg.getId(), false).text);
        if (!this.params.getMatchUserRegex()) {
            return;
        }
        String userRegex = this.params.getUserRegex();
        // Compare by content: a reference comparison against "" does not detect empty strings.
        if (userRegex == null || userRegex.isEmpty()) {
            return;
        }
        try {
            Pattern pattern = Pattern.compile(userRegex);
            if (pattern.matcher(srcText).find() || pattern.matcher(trgText).find()) {
                this.markSegmentForRemoval(tu, seg, targetLocale);
            }
        }
        catch (PatternSyntaxException patException) {
            this.LOGGER.error("The following error occured \"{}\" in the expression: {}.", (Object)patException.getDescription(), (Object)patException.getPattern());
        }
    }

    /**
     * Removes source segments whose target segment is empty and reports whether the text unit
     * itself should be discarded.
     *
     * <p>Segments are visited in order. A segment that is not the last one and whose target is
     * empty is removed from the source container. When the last remaining segment has an empty
     * target, the method returns {@code true} without removing it. A source segment that has no
     * target segment at all for the locale is kept, matching {@code run}, which skips such
     * segments instead of cleaning them.
     *
     * @param tu           the text unit to prune
     * @param targetLocale the locale of the target segments to inspect
     * @return {@code true} when the text unit is empty or its last remaining segment has an
     *         empty target; {@code false} otherwise
     */
    protected boolean pruneTextUnit(ITextUnit tu, LocaleId targetLocale) {
        if (tu.isEmpty()) {
            return true;
        }
        TextContainer tc = tu.getSource();
        ISegments srcSegs = tc.getSegments();
        int cursor = 0;
        while (cursor < srcSegs.count()) {
            Segment srcSeg = srcSegs.get(cursor);
            Segment trgSeg = tu.getTargetSegment(targetLocale, srcSeg.getId(), false);
            // A missing target segment was never cleaned or marked, so it is not prunable.
            boolean targetEmpty = trgSeg != null && trgSeg.text.isEmpty();
            if (!targetEmpty) {
                ++cursor;
                continue;
            }
            if (cursor == srcSegs.count() - 1) {
                return true;
            }
            // NOTE(review): confirm that TextContainer.remove expects the index returned by
            // ISegments.getIndex. The cursor is not advanced because the next segment moves
            // into this position once the removal succeeds.
            tc.remove(srcSegs.getIndex(srcSeg.getId()));
        }
        return false;
    }

    /**
     * Runs the character checks on a segment pair: first the corruption check, then the
     * unusual-character check. Either check may mark the segment for removal.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    protected void checkCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        removeCorruptions(tu, seg, targetLocale);
        checkUnusualCharacters(tu, seg, targetLocale);
    }

    /**
     * Compiled once: U+00C3 followed by a character in U+00A4..U+00B6, or by U+201E, U+2026 or
     * U+2013. Presumably these are the sequences produced when UTF-8 text is decoded with a
     * single-byte Windows code page; verify against the step's documentation.
     */
    private static final Pattern CORRUPTION_PATTERN =
            Pattern.compile("\\u00C3[\\u00A4-\\u00B6]|\\u00C3\\u201E|\\u00C3\\u2026|\\u00C3\\u2013");

    /**
     * Marks the segment for removal when the coded text of the source or of the target contains
     * a sequence matched by {@link #CORRUPTION_PATTERN}.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    private void removeCorruptions(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        String srcText = seg.getContent().getCodedText();
        String trgText = trgFrag.getCodedText();
        boolean corrupted = CORRUPTION_PATTERN.matcher(srcText).find()
                || CORRUPTION_PATTERN.matcher(trgText).find();
        if (corrupted) {
            this.markSegmentForRemoval(tu, seg, targetLocale);
        }
    }

    /** Compiled once: three consecutive characters in the range U+00C0..U+00FF. */
    private static final Pattern UNUSUAL_CHARACTERS_PATTERN = Pattern.compile("[\\u00C0-\\u00FF]{3}");

    /**
     * Marks the segment for removal when the coded text of the source or of the target contains
     * three consecutive characters in the range U+00C0..U+00FF.
     *
     * <p>The target segment is looked up without being created, so it must already exist.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    private void checkUnusualCharacters(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        TextFragment trgFrag = tu.getTargetSegment(targetLocale, seg.getId(), false).getContent();
        String srcText = seg.getContent().getCodedText();
        String trgText = trgFrag.getCodedText();
        boolean unusual = UNUSUAL_CHARACTERS_PATTERN.matcher(srcText).find()
                || UNUSUAL_CHARACTERS_PATTERN.matcher(trgText).find();
        if (unusual) {
            this.markSegmentForRemoval(tu, seg, targetLocale);
        }
    }

    /**
     * Scans the target text for characters that are neither encodable by the configured charset
     * nor matched by the extra-characters pattern, recording the first offender and the distinct
     * later ones.
     *
     * <p>As written, the charset name is always {@code null} (so no encoder is created), the
     * extra-characters pattern is the empty expression, and the collected results are not
     * returned or stored anywhere. The method is also not called from within this class.
     *
     * @param tu           the text unit holding the target segment
     * @param seg          the source segment
     * @param targetLocale the locale of the target segment
     */
    private void checkCharacterSet(ITextUnit tu, Segment seg, LocaleId targetLocale) {
        Segment targetSegment = tu.getTargetSegment(targetLocale, seg.getId(), false);
        StringBuilder targetText = new StringBuilder(targetSegment.text);

        // Encoder used to test encodability; stays null while no charset name is set.
        String charsetName = null;
        CharsetEncoder encoder = null;
        if (!Util.isEmpty(charsetName)) {
            encoder = Charset.forName(charsetName).newEncoder();
        }

        // Pattern of characters accepted regardless of the encoder.
        boolean allowExtraCharacters = false;
        String extraCharsExpression = "";
        Pattern extraCharsAllowed = null;
        if (!allowExtraCharacters) {
            extraCharsAllowed = Pattern.compile(extraCharsExpression);
        }

        StringBuilder laterBadChars = new StringBuilder();
        int firstBadPos = -1;
        char firstBadChar = '\u0000';
        int badCount = 0;

        for (int index = 0; index < targetText.length(); ++index) {
            char current = targetText.charAt(index);
            boolean accepted = encoder != null && encoder.canEncode(current);
            if (!accepted && extraCharsAllowed != null) {
                accepted = extraCharsAllowed.matcher(targetText.subSequence(index, index + 1)).find();
            }
            if (accepted) {
                continue;
            }
            ++badCount;
            if (badCount == 1) {
                // The first offender is remembered by position and value.
                firstBadPos = index;
                firstBadChar = current;
            } else if (laterBadChars.indexOf(String.valueOf(current)) == -1) {
                // Later offenders are collected once each.
                laterBadChars.append(current);
            }
        }
    }
}

