Refactor CPDF_SimpleParser part 4
Split CPDF_SimpleParser::GetWord() into multiple private helper methods
to make it much easier to understand.
Add another edge case test to show this refactor doesn't change
behavior.
Change-Id: Ia31fba726d3b5a0b76c14c5da573315062bfdf10
Reviewed-on: https://pdfium-review.googlesource.com/c/pdfium/+/123413
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Andy Phan <andyphan@chromium.org>
diff --git a/core/fpdfapi/parser/cpdf_simple_parser.cpp b/core/fpdfapi/parser/cpdf_simple_parser.cpp
index 38c1bdb..7ed049d 100644
--- a/core/fpdfapi/parser/cpdf_simple_parser.cpp
+++ b/core/fpdfapi/parser/cpdf_simple_parser.cpp
@@ -8,7 +8,10 @@
#include <stdint.h>
+#include <optional>
+
#include "core/fpdfapi/parser/fpdf_parser_utility.h"
+#include "core/fxcrt/check_op.h"
CPDF_SimpleParser::CPDF_SimpleParser(pdfium::span<const uint8_t> input)
: data_(input) {}
@@ -16,29 +19,54 @@
CPDF_SimpleParser::~CPDF_SimpleParser() = default;
ByteStringView CPDF_SimpleParser::GetWord() {
- uint8_t cur_char;
+ std::optional<uint8_t> start_char = SkipSpacesAndComments();  // First byte of the next word; it has already been consumed.
+ if (!start_char.has_value()) {
+ return ByteStringView();  // Data ran out before a word started.
+ }
- // Skip whitespace and comment lines.
+ CHECK_GT(cur_position_, 0);
+ uint32_t start_position = cur_position_ - 1;  // Index of `start_char`; `cur_position_` is one past it.
+ CHECK_LT(start_position, data_.size());
+
+ if (!PDFCharIsDelimiter(start_char.value())) {
+ return HandleNonDelimiter();
+ }
+
+ switch (start_char.value()) {  // Only delimiter characters reach this switch.
+ case '/':
+ return HandleName();
+ case '<':
+ return HandleBeginAngleBracket();
+ case '>':
+ return HandleEndAngleBracket();
+ case '(':
+ return HandleParentheses();
+ default:
+ return ByteStringView(data_.subspan(start_position, 1));  // Any other delimiter is returned as a one-byte word.
+ }
+}
+
+std::optional<uint8_t> CPDF_SimpleParser::SkipSpacesAndComments() {
while (true) {
if (cur_position_ >= data_.size()) {
- return ByteStringView();
+ return std::nullopt;
}
- cur_char = data_[cur_position_++];
+ uint8_t cur_char = data_[cur_position_++];
while (PDFCharIsWhitespace(cur_char)) {
if (cur_position_ >= data_.size()) {
- return ByteStringView();
+ return std::nullopt;
}
cur_char = data_[cur_position_++];
}
if (cur_char != '%') {
- break;
+ return cur_char;
}
while (true) {
if (cur_position_ >= data_.size()) {
- return ByteStringView();
+ return std::nullopt;
}
cur_char = data_[cur_position_++];
@@ -47,80 +75,93 @@
}
}
}
+}
+ByteStringView CPDF_SimpleParser::HandleName() {  // Reads a word that starts with '/'; the '/' is already consumed.
uint32_t start_position = cur_position_ - 1;
- if (PDFCharIsDelimiter(cur_char)) {
- // Find names
- if (cur_char == '/') {
- while (cur_position_ < data_.size()) {
- cur_char = data_[cur_position_];
- // Stop parsing after encountering a whitespace or delimiter.
- if (PDFCharIsWhitespace(cur_char) || PDFCharIsDelimiter(cur_char)) {
- return ByteStringView(
- data_.subspan(start_position, cur_position_ - start_position));
- }
- ++cur_position_;
- }
- return ByteStringView();
+ while (cur_position_ < data_.size()) {
+ uint8_t cur_char = data_[cur_position_];  // Peek only; the terminating byte is left unconsumed.
+ // Stop parsing after encountering a whitespace or delimiter.
+ if (PDFCharIsWhitespace(cur_char) || PDFCharIsDelimiter(cur_char)) {
+ return ByteStringView(
+ data_.subspan(start_position, cur_position_ - start_position));
}
+ ++cur_position_;
+ }
+ return ByteStringView();  // End of data reached with no terminator: an empty view is returned.
+}
- if (cur_char == '<') {
- if (cur_position_ >= data_.size()) {
- return ByteStringView(
- data_.subspan(start_position, cur_position_ - start_position));
- }
- cur_char = data_[cur_position_++];
- if (cur_char != '<') {
- while (cur_position_ < data_.size() && data_[cur_position_] != '>') {
- cur_position_++;
- }
-
- if (cur_position_ < data_.size()) {
- cur_position_++;
- }
- }
- } else if (cur_char == '>') {
- if (cur_position_ >= data_.size()) {
- return ByteStringView(
- data_.subspan(start_position, cur_position_ - start_position));
- }
- if (data_[cur_position_] == '>') {
- ++cur_position_;
- }
- } else if (cur_char == '(') {
- int level = 1;
- while (cur_position_ < data_.size()) {
- if (data_[cur_position_] == ')') {
- level--;
- if (level == 0)
- break;
- }
-
- if (data_[cur_position_] == '\\') {
- if (cur_position_ >= data_.size()) {
- break;
- }
-
- cur_position_++;
- } else if (data_[cur_position_] == '(') {
- level++;
- }
- if (cur_position_ >= data_.size()) {
- break;
- }
-
- cur_position_++;
- }
- if (cur_position_ < data_.size()) {
- cur_position_++;
- }
- }
+ByteStringView CPDF_SimpleParser::HandleBeginAngleBracket() {  // Reads a word that starts with '<'; the '<' is already consumed.
+ uint32_t start_position = cur_position_ - 1;
+ if (cur_position_ >= data_.size()) {  // '<' was the last byte, so it is returned on its own.
return ByteStringView(
data_.subspan(start_position, cur_position_ - start_position));
}
+ if (data_[cur_position_++] != '<') {  // Always consumes the next byte; a second '<' ends the word as "<<".
+ while (cur_position_ < data_.size() && data_[cur_position_] != '>') {  // Advance to the next '>' or the end of data.
+ cur_position_++;
+ }
+
+ if (cur_position_ < data_.size()) {  // A '>' was found; include it in the word.
+ cur_position_++;
+ }
+ }
+ return ByteStringView(
+ data_.subspan(start_position, cur_position_ - start_position));
+}
+
+ByteStringView CPDF_SimpleParser::HandleEndAngleBracket() {  // Reads a word that starts with '>'; the '>' is already consumed.
+ uint32_t start_position = cur_position_ - 1;
+ if (cur_position_ >= data_.size()) {  // '>' was the last byte, so it is returned on its own.
+ return ByteStringView(
+ data_.subspan(start_position, cur_position_ - start_position));
+ }
+
+ if (data_[cur_position_] == '>') {  // Extend the word to ">>" when the next byte is also '>'.
+ ++cur_position_;
+ }
+ return ByteStringView(
+ data_.subspan(start_position, cur_position_ - start_position));
+}
+
+ByteStringView CPDF_SimpleParser::HandleParentheses() {  // Reads a word that starts with '('; the '(' is already consumed.
+ uint32_t start_position = cur_position_ - 1;
+ int level = 1;  // Count of currently open parentheses, starting with the consumed '('.
while (cur_position_ < data_.size()) {
- cur_char = data_[cur_position_];
+ if (data_[cur_position_] == ')') {
+ level--;
+ if (level == 0) {  // Matching ')' found; it is consumed after the loop.
+ break;
+ }
+ }
+
+ if (data_[cur_position_] == '\\') {  // Backslash: step onto the next byte so the increment below skips it.
+ if (cur_position_ >= data_.size()) {  // NOTE(review): looks unreachable; the loop condition already rules this out.
+ break;
+ }
+
+ cur_position_++;
+ } else if (data_[cur_position_] == '(') {
+ level++;
+ }
+ if (cur_position_ >= data_.size()) {  // Can only be true when a backslash was the last byte of data.
+ break;
+ }
+
+ cur_position_++;
+ }
+ if (cur_position_ < data_.size()) {  // Loop stopped on the matching ')'; include it in the word.
+ cur_position_++;
+ }
+ return ByteStringView(
+ data_.subspan(start_position, cur_position_ - start_position));
+}
+
+ByteStringView CPDF_SimpleParser::HandleNonDelimiter() {
+ uint32_t start_position = cur_position_ - 1;
+ while (cur_position_ < data_.size()) {
+ uint8_t cur_char = data_[cur_position_];
if (PDFCharIsDelimiter(cur_char) || PDFCharIsWhitespace(cur_char)) {
break;
}
diff --git a/core/fpdfapi/parser/cpdf_simple_parser.h b/core/fpdfapi/parser/cpdf_simple_parser.h
index c916feb..dfcf1a2 100644
--- a/core/fpdfapi/parser/cpdf_simple_parser.h
+++ b/core/fpdfapi/parser/cpdf_simple_parser.h
@@ -9,6 +9,8 @@
#include <stdint.h>
+#include <optional>
+
#include "core/fxcrt/bytestring.h"
#include "core/fxcrt/span.h"
@@ -23,6 +25,16 @@
uint32_t GetCurrentPosition() const { return cur_position_; }
private:
+ // Skips whitespace and comment lines. Returns the first parseable character
+ // if `data_` can still be parsed, nullopt otherwise.
+ std::optional<uint8_t> SkipSpacesAndComments();
+
+ ByteStringView HandleName();  // GetWord() helper for a word starting with '/'.
+ ByteStringView HandleBeginAngleBracket();  // GetWord() helper for a word starting with '<'.
+ ByteStringView HandleEndAngleBracket();  // GetWord() helper for a word starting with '>'.
+ ByteStringView HandleParentheses();  // GetWord() helper for a word starting with '('.
+ ByteStringView HandleNonDelimiter();  // GetWord() helper for a word starting with a non-delimiter byte.
+
const pdfium::span<const uint8_t> data_;
// The current unread position.
diff --git a/core/fpdfapi/parser/cpdf_simple_parser_unittest.cpp b/core/fpdfapi/parser/cpdf_simple_parser_unittest.cpp
index 08abf54..48439d8 100644
--- a/core/fpdfapi/parser/cpdf_simple_parser_unittest.cpp
+++ b/core/fpdfapi/parser/cpdf_simple_parser_unittest.cpp
@@ -51,6 +51,8 @@
STR_IN_OUT_CASE(">> end ", ">>"),
// No ending delimiters.
STR_IN_OUT_CASE("(sdfgfgbcv", "(sdfgfgbcv"),
+ // Other delimiters.
+ STR_IN_OUT_CASE("}", "}"),
// Regular cases.
STR_IN_OUT_CASE("apple pear", "apple"),
STR_IN_OUT_CASE(" pi=3.1415 ", "pi=3.1415"),