Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OSSFuzz Integration #949

Merged
merged 21 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions .github/workflows/cifuzz.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Continuous fuzzing of pull requests and pushes to master via CIFuzz.
name: CIFuzz
on:
  push:
    branches:
      - master
  pull_request:
# Drop every default token permission; the job re-grants only what it needs.
permissions: {}
jobs:
  Fuzzing:
    runs-on: ubuntu-latest
    permissions:
      # Required by the SARIF upload step below.
      security-events: write
    steps:
      - name: Build Fuzzers
        id: build
        uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfminersix'
          language: python
      - name: Run Fuzzers
        uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
        with:
          oss-fuzz-project-name: 'pdfminersix'
          language: python
          fuzz-seconds: 800
          output-sarif: true
      - name: Upload Crash
        # v3 of upload-artifact is deprecated; v4 accepts the same name/path inputs.
        uses: actions/upload-artifact@v4
        # Only runs when the build succeeded but a later step (the fuzz run) failed.
        if: failure() && steps.build.outcome == 'success'
        with:
          name: artifacts
          path: ./out/artifacts
      - name: Upload Sarif
        if: always() && steps.build.outcome == 'success'
        # v2 of codeql-action is deprecated; v3 accepts the same inputs.
        uses: github/codeql-action/upload-sarif@v3
        with:
          # Path to SARIF file relative to the root of the repository
          sarif_file: cifuzz-sarif/results.sarif
          checkout_path: cifuzz-sarif
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
### Added

- Support for zipped jpeg's ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))

- Fuzzing harnesses for integration into Google's OSS-Fuzz ([#949](https://github.com/pdfminer/pdfminer.six/pull/949))
- Support for setuptools-git-versioning version 2.0.0 ([#957](https://github.com/pdfminer/pdfminer.six/pull/957))

### Fixed
Expand Down
Empty file added fuzzing/__init__.py
Empty file.
10 changes: 10 additions & 0 deletions fuzzing/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Builds every *_fuzzer.py harness under fuzzing/ and ships a seed corpus
# archive next to each one.
# Expects $SRC and $OUT to be set by the build environment and
# compile_python_fuzzer to be available on PATH.

# Abort instead of installing/building from the wrong directory.
cd "$SRC"/pdfminer.six || exit 1

# Quoted so the shell never treats ".[dev]" as a glob pattern.
pip3 install '.[dev]'

# Build fuzzers in $OUT
for fuzzer in $(find fuzzing -name '*_fuzzer.py'); do
  compile_python_fuzzer "$fuzzer" --collect-all charset_normalizer --hidden-import=_cffi_backend
  base_name=$(basename "$fuzzer")
  base_name_no_ext=${base_name%.*}
  # The archive is named after the harness; only the corpus glob is left
  # unquoted so that it still expands to the individual seed files.
  zip -q "$OUT/$base_name_no_ext.zip" "$SRC"/corpus/*
done
39 changes: 39 additions & 0 deletions fuzzing/extract_text_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys

import atheris

from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
)
from pdfminer.high_level import extract_text

from pdfminer.psexceptions import PSException


def fuzz_one_input(data: bytes) -> None:
    """Run ``extract_text`` once on a fuzzer-generated input.

    Inputs rejected by ``is_valid_byte_stream`` are dropped immediately.
    Otherwise the bytes are split, via ``PdfminerFuzzedDataProvider``, into an
    in-memory file and the remaining ``extract_text`` arguments.

    :param data: Raw bytes handed over by the fuzzing engine
    """
    if not is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return

    fdp = PdfminerFuzzedDataProvider(data)

    try:
        # Close the in-memory file deterministically instead of leaving it to
        # garbage collection.
        with fdp.ConsumeMemoryFile() as pdf_file:
            extract_text(
                pdf_file,
                maxpages=fdp.ConsumeIntInRange(0, 10),
                page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
                laparams=generate_layout_parameters(fdp),
            )
    except (AssertionError, PSException):
        # These two exception types are swallowed; anything else propagates
        # to the caller.
        return


if __name__ == "__main__":
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, fuzz_one_input)
    atheris.Fuzz()
49 changes: 49 additions & 0 deletions fuzzing/extract_text_to_fp_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import io
import sys

import atheris

from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.psexceptions import PSException

available_output_formats = ["text", "html", "xml", "tag"]
available_layout_modes = ["exact", "normal", "loose"]


def fuzz_one_input(data: bytes) -> None:
    """Run ``extract_text_to_fp`` once on a fuzzer-generated input.

    Inputs rejected by ``is_valid_byte_stream`` are dropped immediately.
    Otherwise the bytes are split into an in-memory input file and the
    keyword arguments of ``extract_text_to_fp``; output goes to a throwaway
    ``io.BytesIO``.

    :param data: Raw bytes handed over by the fuzzing engine
    """
    if not is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return

    fdp = PdfminerFuzzedDataProvider(data)

    try:
        with fdp.ConsumeMemoryFile(all_data=False) as pdf_in, io.BytesIO() as text_out:
            # Values are drawn one by one, in the same order in which they
            # are passed below, so the input bytes map to arguments
            # identically on every run.
            output_type = fdp.PickValueInList(available_output_formats)
            laparams = generate_layout_parameters(fdp)
            maxpages = fdp.ConsumeIntInRange(0, 10)
            page_numbers = fdp.ConsumeOptionalIntList(10, 0, 10)
            scale = fdp.ConsumeFloatInRange(0.0, 2.0)
            rotation = fdp.ConsumeIntInRange(0, 360)
            layoutmode = fdp.PickValueInList(available_layout_modes)
            strip_control = fdp.ConsumeBool()

            extract_text_to_fp(
                pdf_in,
                text_out,
                output_type=output_type,
                laparams=laparams,
                maxpages=maxpages,
                page_numbers=page_numbers,
                scale=scale,
                rotation=rotation,
                layoutmode=layoutmode,
                strip_control=strip_control,
            )
    except (AssertionError, PSException):
        # These two exception types are swallowed; anything else propagates
        # to the caller.
        pass


if __name__ == "__main__":
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, fuzz_one_input)
    atheris.Fuzz()
34 changes: 34 additions & 0 deletions fuzzing/fuzzed_data_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import io
from typing import List, Optional

from atheris import FuzzedDataProvider


class PdfminerFuzzedDataProvider(FuzzedDataProvider):  # type: ignore[misc]
    """``FuzzedDataProvider`` with convenience consumers for the harnesses."""

    def ConsumeRandomBytes(self) -> bytes:
        """Consume a provider-chosen number of bytes (0 up to all remaining)."""
        length = self.ConsumeIntInRange(0, self.remaining_bytes())
        chunk = self.ConsumeBytes(length)
        return bytes(chunk)

    def ConsumeRandomString(self) -> str:
        """Consume a surrogate-free string of provider-chosen length."""
        length = self.ConsumeIntInRange(0, self.remaining_bytes())
        text = self.ConsumeUnicodeNoSurrogates(length)
        return str(text)

    def ConsumeRemainingString(self) -> str:
        """Consume a surrogate-free string sized by the bytes still left."""
        remaining = self.remaining_bytes()
        return str(self.ConsumeUnicodeNoSurrogates(remaining))

    def ConsumeRemainingBytes(self) -> bytes:
        """Consume every byte that is still left."""
        remaining = self.remaining_bytes()
        return bytes(self.ConsumeBytes(remaining))

    def ConsumeMemoryFile(self, all_data: bool = False) -> io.BytesIO:
        """Wrap consumed bytes in an in-memory binary file.

        :param all_data: When true, use all remaining bytes; otherwise use a
            provider-chosen amount (see ``ConsumeRandomBytes``)
        :return: A ``io.BytesIO`` positioned at the start of the payload
        """
        payload = (
            self.ConsumeRemainingBytes() if all_data else self.ConsumeRandomBytes()
        )
        return io.BytesIO(payload)

    def ConsumeOptionalIntList(
        self, max_count: int, min: int, max: int
    ) -> Optional[List[int]]:
        """Consume either ``None`` or a list of bounded integers.

        A consumed bool decides whether a list is produced at all.

        :param max_count: Upper bound for the length of the list
        :param min: Lower bound passed to ``ConsumeIntListInRange``
        :param max: Upper bound passed to ``ConsumeIntListInRange``
        :return: ``None`` or a list of up to ``max_count`` ints
        """
        if not self.ConsumeBool():
            return None

        count = self.ConsumeIntInRange(0, max_count)
        values = self.ConsumeIntListInRange(count, min, max)
        return list(map(int, values))
41 changes: 41 additions & 0 deletions fuzzing/page_extraction_fuzzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
import atheris
import sys

from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
)
from pdfminer.high_level import extract_pages
from pdfminer.psexceptions import PSException


def fuzz_one_input(data: bytes) -> None:
    """Run ``extract_pages`` once on a fuzzer-generated input.

    Inputs rejected by ``is_valid_byte_stream`` are dropped immediately.
    Otherwise the bytes are split into an in-memory file and the remaining
    ``extract_pages`` arguments, and the result is fully iterated.

    :param data: Raw bytes handed over by the fuzzing engine
    """
    if not is_valid_byte_stream(data):
        # Not worth continuing with this test case
        return

    fdp = PdfminerFuzzedDataProvider(data)

    try:
        with fdp.ConsumeMemoryFile() as pdf_file:
            # Values are drawn in the same order in which they are passed.
            maxpages = fdp.ConsumeIntInRange(0, 10)
            page_numbers = fdp.ConsumeOptionalIntList(10, 0, 10)
            laparams = generate_layout_parameters(fdp)

            pages = extract_pages(
                pdf_file,
                maxpages=maxpages,
                page_numbers=page_numbers,
                laparams=laparams,
            )
            # Materialise the result so that every page is actually produced.
            list(pages)
    except (AssertionError, PSException):
        # These two exception types are swallowed; anything else propagates
        # to the caller.
        pass


if __name__ == "__main__":
    prepare_pdfminer_fuzzing()
    atheris.Setup(sys.argv, fuzz_one_input)
    atheris.Fuzz()
53 changes: 53 additions & 0 deletions fuzzing/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Utilities shared across the various PDF fuzzing harnesses
"""
import logging
from typing import Optional

import atheris

from pdfminer.layout import LAParams

PDF_MAGIC_BYTES = b"%PDF-"


def prepare_pdfminer_fuzzing() -> None:
    """
    Raise the threshold of the "pdfminer" logger to CRITICAL so that
    lower-severity records from it are not emitted
    """
    pdfminer_logger = logging.getLogger("pdfminer")
    pdfminer_logger.setLevel(logging.CRITICAL)


@atheris.instrument_func  # type: ignore[misc]
def generate_layout_parameters(
    fdp: atheris.FuzzedDataProvider,
) -> Optional[LAParams]:
    """Build an optional ``LAParams`` from fuzzer-provided values.

    :param fdp: Data provider the values are consumed from
    :return: ``None`` when the first consumed bool is true, otherwise a
        ``LAParams`` populated from further consumed values
    """
    skip_layout_params = fdp.ConsumeBool()
    if skip_layout_params:
        return None

    # boxes_flow is either left unset or drawn from [-1.0, 1.0].
    boxes_flow: Optional[float] = (
        fdp.ConsumeFloatInRange(-1.0, 1.0) if fdp.ConsumeBool() else None
    )

    # The remaining values are consumed in the order of the keyword
    # arguments below.
    line_overlap = fdp.ConsumeFloat()
    char_margin = fdp.ConsumeFloat()
    line_margin = fdp.ConsumeFloat()
    word_margin = fdp.ConsumeFloat()
    detect_vertical = fdp.ConsumeBool()
    all_texts = fdp.ConsumeBool()

    return LAParams(
        line_overlap=line_overlap,
        char_margin=char_margin,
        line_margin=line_margin,
        word_margin=word_margin,
        boxes_flow=boxes_flow,
        detect_vertical=detect_vertical,
        all_texts=all_texts,
    )


@atheris.instrument_func  # type: ignore[misc]
def is_valid_byte_stream(data: bytes) -> bool:
    """Cheap pre-filter deciding whether an input is worth fuzzing
    :param data: Raw bytes to check
    :return: True only if the data starts with the PDF magic bytes and
        contains b"/Root" somewhere
    """
    has_pdf_header = data.startswith(PDF_MAGIC_BYTES)
    return has_pdf_header and b"/Root" in data
5 changes: 4 additions & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,7 @@ ignore_missing_imports = True
ignore_missing_imports = True

[mypy-charset_normalizer.*]
ignore_missing_imports = True
ignore_missing_imports = True

[mypy-atheris.*]
ignore_missing_imports = True
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


PYTHON_ALL_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"]
PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"]
PYTHON_MODULES = ["fuzzing", "pdfminer", "tools", "tests", "noxfile.py", "setup.py"]


@nox.session
Expand Down
6 changes: 4 additions & 2 deletions pdfminer/_saslprep.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
from typing import Callable, Tuple
import unicodedata

from .pdfexceptions import PDFValueError

# RFC4013 section 2.3 prohibited output.
_PROHIBITED: Tuple[Callable[[str], bool], ...] = (
# A strict reading of RFC 4013 requires table c12 here, but
Expand Down Expand Up @@ -77,7 +79,7 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454, Section 6, #3. If a string contains any
# RandALCat character, the first and last characters
# MUST be RandALCat characters.
raise ValueError("SASLprep: failed bidirectional check")
raise PDFValueError("SASLprep: failed bidirectional check")
# RFC3454, Section 6, #2. If a string contains any RandALCat
# character, it MUST NOT contain any LCat character.
prohibited = prohibited + (stringprep.in_table_d2,)
Expand All @@ -90,6 +92,6 @@ def saslprep(data: str, prohibit_unassigned_code_points: bool = True) -> str:
# RFC3454 section 2, step 3 and 4 - Prohibit and check bidi
for char in data:
if any(in_table(char) for in_table in prohibited):
raise ValueError("SASLprep: failed prohibited character check")
raise PDFValueError("SASLprep: failed prohibited character check")

return data
13 changes: 9 additions & 4 deletions pdfminer/ccitt.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
cast,
)

from .pdfexceptions import PDFException, PDFValueError


def get_bytes(data: bytes) -> Iterator[int]:
yield from data
Expand Down Expand Up @@ -331,13 +333,16 @@ class CCITTG4Parser(BitParser):
BitParser.add(UNCOMPRESSED, "T00000", "00000000011")
BitParser.add(UNCOMPRESSED, "T10000", "00000000010")

class EOFB(Exception):
class CCITTException(PDFException):
pass

class EOFB(CCITTException):
pass

class InvalidData(Exception):
class InvalidData(CCITTException):
pass

class ByteSkip(Exception):
class ByteSkip(CCITTException):
pass

_color: int
Expand Down Expand Up @@ -584,7 +589,7 @@ def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
reversed = cast(bool, params.get("BlackIs1"))
parser = CCITTFaxDecoder(cols, bytealign=bytealign, reversed=reversed)
else:
raise ValueError(K)
raise PDFValueError(K)
parser.feedbytes(data)
return parser.close()

Expand Down
8 changes: 4 additions & 4 deletions pdfminer/cmapdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,21 @@
Set,
)

from pdfminer.pdfexceptions import PDFException, PDFTypeError
from .encodingdb import name2unicode
from .psparser import KWD
from .psparser import PSEOF
from pdfminer.psexceptions import PSEOF, PSSyntaxError
from .psparser import PSKeyword
from .psparser import PSLiteral
from .psparser import PSStackParser
from .psparser import PSSyntaxError
from .psparser import literal_name
from .utils import choplist
from .utils import nunpack

log = logging.getLogger(__name__)


class CMapError(Exception):
class CMapError(PDFException):
pass


Expand Down Expand Up @@ -202,7 +202,7 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None:
elif isinstance(code, int):
unichr = chr(code)
else:
raise TypeError(code)
raise PDFTypeError(code)

# A0 = non-breaking space, some weird fonts can have a collision on a cid here.
if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ":
Expand Down
Loading
Loading