kaarbe/html-extractor:从HTML中提取文本的简单Java库


从具有HTML标记的输入中提取文本。只会传回包含在有效HTML标记中的文字。无效标记、标记之间的文本以及不带任何HTML的文本将在创建最终结果之前被删除。

package com.github.kaarbe;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

public class HtmlExtractor {

  private static final String EMPTY_STRING = "";

  private HtmlExtractor() {
  }


  public static String extract(final String input, final boolean shouldTrim) {
    if (!Validator.isValid(input)) {
      return EMPTY_STRING;
    }
    List<Character> chars = shouldTrim ? Trimmer.trim(toCharList(input)) : toCharList(input);

    int index = 0;
    while (mayContainHtmlTag(chars) && index < chars.size()) {
      HtmlTag firstTag = HtmlTag.findOne(0, chars);
      index = firstTag.getEndIndex();

      HtmlTag secondTag = HtmlTag.findOne(index, chars);
      index = secondTag.getEndIndex();

      while (!(firstTag.isOpening() && secondTag.isClosing())) {
        firstTag = secondTag;
        secondTag = HtmlTag.findOne(index, chars);
        index = secondTag.getEndIndex();
      }

      chars = firstTag.isPairWith(secondTag)
          ? getWithoutTags(chars, firstTag, secondTag)
          : getWithoutTagsAndContent(chars, firstTag, secondTag);
    }
    return getWithoutRemainingTags(chars)
        .stream()
        .map(Objects::toString)
        .collect(Collectors.joining(EMPTY_STRING));
  }

  private static List<Character> toCharList(final String text) {
    return text
        .chars()
        .mapToObj(codePointValue -> (char) codePointValue)
        .collect(Collectors.toList());
  }

  private static boolean mayContainHtmlTag(final List<Character> chars) {
    return chars.contains('<')
        && chars.contains('>');
  }

  private static List<Character> getWithoutTags(
      final List<Character> chars, final HtmlTag opening, final HtmlTag closing) {
    List<Character> beforeOpeningTag = chars.subList(0, opening.getStartIndex());
    List<Character> inBetweenTags = chars.subList(opening.getEndIndex() + 1, closing.getStartIndex());
    List<Character> afterClosingTag = chars.subList(closing.getEndIndex() + 1, chars.size());
    List<Character> remainingTags =
        new ArrayList<>(beforeOpeningTag.size() + inBetweenTags.size() + afterClosingTag.size());
    remainingTags.addAll(beforeOpeningTag);
    remainingTags.addAll(inBetweenTags);
    remainingTags.addAll(afterClosingTag);
    return remainingTags;
  }

  private static List<Character> getWithoutTagsAndContent(
      final List<Character> chars, final HtmlTag firstTag, final HtmlTag secondTag) {
    List<Character> remainingChars = new ArrayList<>(chars);
    remainingChars
        .subList(firstTag.getStartIndex(), secondTag.getEndIndex() + 1)
        .clear();
    return remainingChars;
  }

  private static List<Character> getWithoutRemainingTags(final List<Character> chars) {
    if (!mayContainHtmlTag(chars)) {
      return chars;
    }
    List<Character> remainingChars = new ArrayList<>(chars);
    while (mayContainHtmlTag(remainingChars)) {
      int index = 0;
      HtmlTag tag = HtmlTag.findOne(index, remainingChars);
      index = tag.getEndIndex();
      if (Identifier.isTagClosingChar(remainingChars.get(index))) {
        remainingChars
            .subList(tag.getStartIndex(), tag.getEndIndex() + 1)
            .clear();
      }
    }
    return remainingChars;
  }
}

点击标题见完整代码