diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..658f079091b24c3afa4dcd4c72d507bf986d4b08 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +demo/demo2.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000000000000000000000000000000..8bb192647da8edf8b7991467a40a2f52d64af052 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,85 @@ +name: Bug Report | 反馈 Bug +description: Create a bug report for MinerU | MinerU 的 Bug 反馈 +labels: bug + +# We omit `title: "..."` so that the field defaults to blank. If we set it to +# empty string, Github seems to reject this .yml file. + +body: + + - type: textarea + id: description + attributes: + label: Description of the bug | 错误描述 + description: | + A clear and concise description of the bug. | 简单描述遇到的问题 + + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: How to reproduce the bug | 如何复现 + + # Should not word-wrap this description here. + description: | + * Explain the steps required to reproduce the bug. | 说明复现此错误所需的步骤。 + * Include required code snippets, example files, etc. | 包含必要的代码片段、示例文件等。 + * Describe what you expected to happen (if not obvious). | 描述你期望发生的情况。 + * If applicable, add screenshots to help explain the problem. | 添加截图以帮助解释问题。 + * Include any other information that could be relevant, for example information about the Python environment. | 包括任何其他可能相关的信息。 + + For problems when building or installing MinerU: | 在构建或安装 MinerU 时遇到的问题: + * Give the **exact** build/install commands that were run. | 提供**确切**的构建/安装命令。 + * Give the **complete** output from these commands. 
| 提供这些命令的**完整**输出。 + + validations: + required: true + +# - type: markdown +# attributes: +# value: | +# # The information below is required. + + + - type: dropdown + id: os_name + attributes: + label: Operating system | 操作系统 + #multiple: true + options: + - + - Windows + - Linux + - MacOS + validations: + required: true + + - type: dropdown + id: python_version + attributes: + label: Python version | Python 版本 + #multiple: true + # Need quotes around `3.10` otherwise it is treated as a number and shows as `3.1`. + options: + - + - "3.12" + - "3.11" + - "3.10" + - "3.9" + validations: + required: true + + - type: dropdown + id: device_mode + attributes: + label: Device mode | 设备模式 + #multiple: true + options: + - + - cpu + - cuda + - mps + validations: + required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000000000000000000000000000000..8eee8d403ed164e542db56d4c9ac879460aeb2dd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,28 @@ +--- +name: Feature request | 功能需求 +about: Suggest an idea for this project | 提出一个有价值的idea +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +**您的特性请求是否与某个问题相关?请描述。** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +对存在的问题进行清晰且简洁的描述。例如:我一直很困扰的是 [...] + +**Describe the solution you'd like** +**描述您期望的解决方案** +A clear and concise description of what you want to happen. +清晰且简洁地描述您希望实现的内容。 + +**Describe alternatives you've considered** +**描述您已考虑的替代方案** +A clear and concise description of any alternative solutions or features you've considered. +清晰且简洁地描述您已经考虑过的任何替代解决方案。 + +**Additional context** +**提供更多细节** +Add any other context or screenshots about the feature request here. 
+请附上任何相关截图、链接或文件,以帮助我们更好地理解您的请求。 \ No newline at end of file diff --git a/.github/workflows/cla.yml b/.github/workflows/cla.yml new file mode 100644 index 0000000000000000000000000000000000000000..f37e6db08a6132d24a32b09d27143b00c7396acf --- /dev/null +++ b/.github/workflows/cla.yml @@ -0,0 +1,43 @@ +name: "CLA Assistant" +on: + issue_comment: + types: [created] + pull_request_target: + types: [opened,closed,synchronize] + +# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings +permissions: + actions: write + contents: write # this can be 'read' if the signatures are in remote repository + pull-requests: write + statuses: write + +jobs: + CLAAssistant: + runs-on: ubuntu-latest + steps: + - name: "CLA Assistant" + if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' + uses: contributor-assistant/github-action@v2.4.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # the below token should have repo scope and must be manually added by you in the repository's secret + # This token is required only if you have configured to store the signatures in a remote repository/organization + PERSONAL_ACCESS_TOKEN: ${{ secrets.RELEASE_TOKEN }} + with: + path-to-signatures: 'signatures/version1/cla.json' + path-to-document: 'https://github.com/cla-assistant/github-action/blob/master/SAPCLA.md' # e.g. 
a CLA or a DCO document + # branch should not be protected + branch: 'main' + allowlist: user1,bot* + + # the followings are the optional inputs - If the optional inputs are not given, then default values will be taken + #remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository) + #remote-repository-name: enter the remote repository name where the signatures should be stored (Default is storing the signatures in the same repository) + #create-file-commit-message: 'For example: Creating file for storing CLA Signatures' + #signed-commit-message: 'For example: $contributorName has signed the CLA in $owner/$repo#$pullRequestNo' + #custom-notsigned-prcomment: 'pull request comment with Introductory message to ask new contributors to sign' + #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' + #custom-allsigned-prcomment: 'pull request comment when all contributors has signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.' 
+ #lock-pullrequest-aftermerge: false - if you don't want this bot to automatically lock the pull request after merging (default - true) + #use-dco-flag: true - If you are using DCO instead of CLA diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml new file mode 100644 index 0000000000000000000000000000000000000000..5e174cbcd6ae5769829d28a3613734287826d0d4 --- /dev/null +++ b/.github/workflows/cli.yml @@ -0,0 +1,46 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: mineru +on: + push: + branches: + - "master" + paths-ignore: + - "cmds/**" + - "**.md" + pull_request: + branches: + - "master" + paths-ignore: + - "cmds/**" + - "**.md" + workflow_dispatch: +jobs: + cli-test: + runs-on: ubuntu-latest + timeout-minutes: 40 + strategy: + fail-fast: true + + steps: + - name: PDF cli + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: check-requirements + run: | + pip install -r requirements.txt + pip install -r requirements-qa.txt + pip install magic-pdf + - name: test_cli + run: | + cp magic-pdf.template.json ~/magic-pdf.json + echo $GITHUB_WORKSPACE + cd $GITHUB_WORKSPACE && export PYTHONPATH=. 
&& pytest -s -v tests/test_unit.py + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py + + - name: benchmark + run: | + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_bench.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000000000000000000000000000000000000..e1e8f99cb9f0682b1c93b80fba5f2f498b191538 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,126 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + tags: + - '*released' + workflow_dispatch: + + +jobs: + + update-version: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: master + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Update version.py + run: | + python update_version.py + + - name: Verify version.py + run: | + ls -l magic_pdf/libs/version.py + cat magic_pdf/libs/version.py + + - name: Commit changes + run: | + git config --local user.email "moe@myhloli.com" + git config --local user.name "myhloli" + git add magic_pdf/libs/version.py + if git diff-index --quiet HEAD; then + echo "No changes to commit" + else + git commit -m "Update version.py with new version" + fi + id: commit_changes + + - name: Push changes + if: steps.commit_changes.outcome == 'success' + env: + GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} + run: | + git push origin HEAD:master + + build: + needs: [ update-version ] + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: master + fetch-depth: 0 + + - name: Verify version.py + run: | + ls -l magic_pdf/libs/version.py + 
cat magic_pdf/libs/version.py + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Install wheel + run: | + python -m pip install wheel + + - name: Build wheel + run: | + python setup.py bdist_wheel + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: wheel-file + path: dist/*.whl + retention-days: 30 + + release: + needs: [ build ] + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: wheel-file + path: dist + + - name: Create and Upload Release + id: create_release + uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981 + with: + files: './dist/*.whl' + env: + GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} + + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/rerun.yml b/.github/workflows/rerun.yml new file mode 100644 index 0000000000000000000000000000000000000000..b607019a959544c1b09f7b39c3e894725eb4e0a7 --- /dev/null +++ b/.github/workflows/rerun.yml @@ -0,0 +1,23 @@ +name: check-status + +on: + workflow_run: + workflows: [ci] + types: [completed] + +jobs: + on-failure: + runs-on: pdf + permissions: + actions: write + if: ${{ (github.event.workflow_run.head_branch == 'master') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }} + steps: + - run: | + echo 'The triggering workflow failed' + sleep 600 + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ github.token }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ 
github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs diff --git a/.github/workflows/update_base.yml b/.github/workflows/update_base.yml new file mode 100644 index 0000000000000000000000000000000000000000..bce75a6c2dc0ce64e0cc38f3e8bbb26d907dc289 --- /dev/null +++ b/.github/workflows/update_base.yml @@ -0,0 +1,22 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: update-base +on: + push: + tags: + - '*released' + workflow_dispatch: +jobs: + pdf-test: + runs-on: pdf + timeout-minutes: 40 + + + steps: + - name: update-base + uses: actions/checkout@v3 + - name: start-update + run: | + echo "start test" + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6de528ed7be16aad90fbf58df5a32fc2850641db --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +*.tar +*.tar.gz +venv*/ +envs/ +slurm_logs/ + +sync1.sh +data_preprocess_pj1 +data-preparation1 +__pycache__ +*.log +*.pyc +.vscode +debug/ +*.ipynb +.idea + +# vscode history +.history + +.DS_Store +.env + +bad_words/ +bak/ + +app/tests/* +temp/ +tmp/ +tmp +.vscode +.vscode/ +/tests/ +ocr_demo + +/app/common/__init__.py +/magic_pdf/config/__init__.py diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..0ad25db4bd1d86c452db3f9602ccdbe172438f52 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/README.md b/README.md index 3152f252170d13a901054ac8caf073570320353a..0e883a3420128e650e9d0a77f3e618e4ef4a17f5 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,292 @@ --- title: MinerU -emoji: 📈 -colorFrom: red -colorTo: pink +app_file: ./demo/app.py sdk: gradio sdk_version: 4.39.0 -app_file: app.py -pinned: false --- +
+
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf) + + + + +[English](README.md) | [简体中文](README_zh-CN.md) + +
+ +
+ +
+ +# MinerU + + +## Introduction + +MinerU is a one-stop, open-source, high-quality data extraction tool, includes the following primary features: + +- [Magic-PDF](#Magic-PDF) PDF Document Extraction +- [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction + + +# Magic-PDF + + +## Introduction + +Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol. + +Key features include: + +- Support for multiple front-end model inputs +- Removal of headers, footers, footnotes, and page numbers +- Human-readable layout formatting +- Retains the original document's structure and formatting, including headings, paragraphs, lists, and more +- Extraction and display of images and tables within markdown +- Conversion of equations into LaTeX format +- Automatic detection and conversion of garbled PDFs +- Compatibility with CPU and GPU environments +- Available for Windows, Linux, and macOS platforms + + +https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070 + + + +## Project Panorama + +![Project Panorama](docs/images/project_panorama_en.png) + + +## Flowchart + +![Flowchart](docs/images/flowchart_en.png) + +### Dependency repositorys + +- [PDF-Extract-Kit : A Comprehensive Toolkit for High-Quality PDF Content Extraction](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀 + +## Getting Started + +### Requirements + +- Python >= 3.9 + +Using a virtual environment is recommended to avoid potential dependency conflicts; both venv and conda are suitable. +For example: +```bash +conda create -n MinerU python=3.10 +conda activate MinerU +``` + +### Installation and Configuration + +#### 1. Install Magic-PDF + +Install the full-feature package with pip: +>Note: The pip-installed package supports CPU-only and is ideal for quick tests. 
+> +>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS). + +```bash +pip install magic-pdf[full-cpu] +``` +The full-feature package depends on detectron2, which requires a compilation installation. +If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114 +Alternatively, you can directly use our precompiled whl package (limited to Python 3.10): + +```bash +pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ +``` + + +#### 2. Downloading model weights files + +For detailed references, please see below [how_to_download_models](docs/how_to_download_models_en.md) + +After downloading the model weights, move the 'models' directory to a directory on a larger disk space, preferably an SSD. + + +#### 3. Copy the Configuration File and Make Configurations +You can get the [magic-pdf.template.json](magic-pdf.template.json) file in the repository root directory. +```bash +cp magic-pdf.template.json ~/magic-pdf.json +``` +In magic-pdf.json, configure "models-dir" to point to the directory where the model weights files are located. + +```json +{ + "models-dir": "/tmp/models" +} +``` + + +#### 4. Acceleration Using CUDA or MPS +If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you can leverage acceleration with CUDA or MPS respectively. +##### CUDA + +You need to install the corresponding PyTorch version according to your CUDA version. +This example installs the CUDA 11.8 version.More information https://pytorch.org/get-started/locally/ +```bash +pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 +``` +Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json. +```json +{ + "device-mode":"cuda" +} +``` + +##### MPS + +For macOS users with M-series chip devices, you can use MPS for inference acceleration. 
+You also need to modify the value of "device-mode" in the configuration file magic-pdf.json.
+```json
+{
+  "device-mode":"mps"
+}
+```
+
+
+### Usage
+
+#### 1. Usage via Command Line
+
+###### simple
+
+```bash
+magic-pdf pdf-command --pdf "pdf_path" --inside_model true
+```
+After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
+You can find the corresponding xxx_model.json file in the markdown directory.
+If you intend to do secondary development on the post-processing pipeline, you can use the command:
+```bash
+magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
+```
+In this way, you won't need to re-run the model data, making debugging more convenient.
+
+
+###### more
+
+```bash
+magic-pdf --help
+```
+
+
+#### 2. Usage via API
+
+###### Local
+```python
+image_writer = DiskReaderWriter(local_image_dir)
+image_dir = str(os.path.basename(local_image_dir))
+jso_useful_key = {"_pdf_type": "", "model_list": []}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+
+###### Object Storage
+```python
+s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
+image_dir = "s3://img_bucket/"
+s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
+pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
+jso_useful_key = {"_pdf_type": "", "model_list": []}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+
+Demo can be referred to [demo.py](demo/demo.py)
+
+
+# Magic-Doc
+
+
+## Introduction
+
+Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
+
+Key Features Include:
+
+- Web Page Extraction
+  - Cross-modal precise parsing of text, images, tables, and formula information.
+ +- E-Book Document Extraction + - Supports various document formats including epub, mobi, with full adaptation for text and images. + +- Language Type Identification + - Accurate recognition of 176 languages. + +https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca + + + +https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d + + + +https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2 + + + + +## Project Repository + +- [Magic-Doc](https://github.com/InternLM/magic-doc) + Outstanding Webpage and E-book Extraction Tool + + +# All Thanks To Our Contributors + + + + + + +# License Information + +[LICENSE.md](LICENSE.md) + +The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility. + + +# Acknowledgments + +- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) +- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) +- [fast-langdetect](https://github.com/LlmKira/fast-langdetect) +- [pdfminer.six](https://github.com/pdfminer/pdfminer.six) + + +# Citation + +```bibtex +@misc{2024mineru, + title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool}, + author={MinerU Contributors}, + howpublished = {\url{https://github.com/opendatalab/MinerU}}, + year={2024} +} +``` + + +# Star History + + + + + + Star History Chart + + diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ed36ae7e11a16065d7628d230678c075fc61fc5b --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,277 @@ +
+
+ +[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf) + +[English](README.md) | [简体中文](README_zh-CN.md) + +
+ +
+ +
+ +# MinerU + + +## 简介 + +MinerU 是一款一站式、开源、高质量的数据提取工具,主要包含以下功能: + +- [Magic-PDF](#Magic-PDF) PDF文档提取 +- [Magic-Doc](#Magic-Doc) 网页与电子书提取 + +# Magic-PDF + + +## 简介 + +Magic-PDF 是一款将 PDF 转化为 markdown 格式的工具。支持转换本地文档或者位于支持S3协议对象存储上的文件。 + +主要功能包含 + +- 支持多种前端模型输入 +- 删除页眉、页脚、脚注、页码等元素 +- 符合人类阅读顺序的排版格式 +- 保留原文档的结构和格式,包括标题、段落、列表等 +- 提取图像和表格并在markdown中展示 +- 将公式转换成latex +- 乱码PDF自动识别并转换 +- 支持cpu和gpu环境 +- 支持windows/linux/mac平台 + + +https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070 + + + +## 项目全景 + +![项目全景图](docs/images/project_panorama_zh_cn.png) + +## 流程图 + +![流程图](docs/images/flowchart_zh_cn.png) + +### 子模块仓库 + +- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) + - 高质量的PDF内容提取工具包 + +## 上手指南 + +### 配置要求 + +python >= 3.9 + +推荐使用虚拟环境,以避免可能发生的依赖冲突,venv和conda均可使用。 +例如: +```bash +conda create -n MinerU python=3.10 +conda activate MinerU +``` +开发基于python 3.10,如果在其他版本python出现问题请切换至3.10。 + +### 安装配置 + +#### 1. 安装Magic-PDF + +使用pip安装完整功能包: +>受pypi限制,pip安装的完整功能包仅支持cpu推理,建议只用于快速测试解析能力。 +> +>如需在生产环境使用CUDA/MPS加速请参考[使用CUDA或MPS加速推理](#4-使用CUDA或MPS加速推理) +```bash +pip install magic-pdf[full-cpu] +``` +完整功能包依赖detectron2,该库需要编译安装,如需自行编译,请参考 https://github.com/facebookresearch/detectron2/issues/5114 +或是直接使用我们预编译的whl包(仅限python 3.10): +```bash +pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ +``` + +#### 2. 下载模型权重文件 + +详细参考 [如何下载模型文件](docs/how_to_download_models_zh_cn.md) +下载后请将models目录移动到空间较大的ssd磁盘目录 + +#### 3. 拷贝配置文件并进行配置 +在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件 +```bash +cp magic-pdf.template.json ~/magic-pdf.json +``` +在magic-pdf.json中配置"models-dir"为模型权重文件所在目录 +```json +{ + "models-dir": "/tmp/models" +} +``` + +#### 4. 
使用CUDA或MPS加速推理 +如您有可用的Nvidia显卡或在使用Apple Silicon的Mac,可以使用CUDA或MPS进行加速 +##### CUDA + +需要根据自己的CUDA版本安装对应的pytorch版本 +以下是对应CUDA 11.8版本的安装命令,更多信息请参考 https://pytorch.org/get-started/locally/ +```bash +pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 +``` + +同时需要修改配置文件magic-pdf.json中"device-mode"的值 +```json +{ + "device-mode":"cuda" +} +``` + +##### MPS +使用macOS(M系列芯片设备)可以使用MPS进行推理加速 +需要修改配置文件magic-pdf.json中"device-mode"的值 +```json +{ + "device-mode":"mps" +} +``` + + +### 使用说明 + +#### 1. 通过命令行使用 + +###### 直接使用 + +```bash +magic-pdf pdf-command --pdf "pdf_path" --inside_model true +``` +程序运行完成后,你可以在"/tmp/magic-pdf"目录下看到生成的markdown文件,markdown目录中可以找到对应的xxx_model.json文件 +如果您有意对后处理pipeline进行二次开发,可以使用命令 +```bash +magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path" +``` +这样就不需要重跑模型数据,调试起来更方便 + +###### 更多用法 + +```bash +magic-pdf --help +``` + + +#### 2. 通过接口调用 + +###### 本地使用 +```python +image_writer = DiskReaderWriter(local_image_dir) +image_dir = str(os.path.basename(local_image_dir)) +jso_useful_key = {"_pdf_type": "", "model_list": model_json} +pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) +pipe.pipe_classify() +pipe.pipe_parse() +md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") +``` + +###### 在对象存储上使用 +```python +s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint) +image_dir = "s3://img_bucket/" +s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir) +pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN) +jso_useful_key = {"_pdf_type": "", "model_list": model_json} +pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli) +pipe.pipe_classify() +pipe.pipe_parse() +md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") +``` + +详细实现可参考 [demo.py](demo/demo.py) + + +### 常见问题处理解答 + +参考 [FAQ](docs/FAQ_zh_cn.md) + + +# Magic-Doc + + +## 简介 + +Magic-Doc 是一款支持将网页或多格式电子书转换为 markdown 格式的工具。 + +主要功能包含 + +- Web网页提取 + - 跨模态精准解析图文、表格、公式信息 + 
+- 电子书文献提取 + - 支持 epub,mobi等多格式文献,文本图片全适配 + +- 语言类型鉴定 + - 支持176种语言的准确识别 + +https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca + + + +https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d + + + +https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2 + + + + +## 项目仓库 + +- [Magic-Doc](https://github.com/InternLM/magic-doc) + 优秀的网页与电子书提取工具 + + +## 感谢我们的贡献者 + + + + + + +## 版权说明 + +[LICENSE.md](LICENSE.md) + +本项目目前采用PyMuPDF以实现高级功能,但因其遵循AGPL协议,可能对某些使用场景构成限制。未来版本迭代中,我们计划探索并替换为许可条款更为宽松的PDF处理库,以提升用户友好度及灵活性。 + + +## 致谢 +- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) +- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) +- [fast-langdetect](https://github.com/LlmKira/fast-langdetect) +- [pdfminer.six](https://github.com/pdfminer/pdfminer.six) + + +# 引用 + +```bibtex +@misc{2024mineru, + title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool}, + author={MinerU Contributors}, + howpublished = {\url{https://github.com/opendatalab/MinerU}}, + year={2024} +} +``` + + +# Star History + + + + + + Star History Chart + + \ No newline at end of file diff --git a/demo/app.py b/demo/app.py new file mode 100644 index 0000000000000000000000000000000000000000..83cbaf909de88fa82486a3e30f37f396fa99471b --- /dev/null +++ b/demo/app.py @@ -0,0 +1,67 @@ +import os +import json +import gradio as gr +from loguru import logger +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +import magic_pdf.model as model_config + +model_config.__use_inside_model__ = True + + +def process_pdf(file_path): + try: + pdf_bytes = open(file_path, "rb").read() + model_json = [] # model_json传空list使用内置模型解析 + jso_useful_key = {"_pdf_type": "", "model_list": model_json} + local_image_dir = os.path.join('uploads', 'images') + if not os.path.exists(local_image_dir): + os.makedirs(local_image_dir) + image_dir = 
str(os.path.basename(local_image_dir)) + image_writer = DiskReaderWriter(local_image_dir) + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) + pipe.pipe_classify() + if len(model_json) == 0: + if model_config.__use_inside_model__: + pipe.pipe_analyze() + else: + logger.error("need model list input") + return None + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") + return md_content + except Exception as e: + logger.exception(e) + return None + + +def extract_markdown_from_pdf(pdf): + # 保存上传的PDF文件 + file_path = os.path.join('uploads', pdf.name) + with open(file_path, 'wb') as f: + f.write(pdf.read()) + + # 处理PDF文件并生成Markdown内容 + md_content = process_pdf(file_path) + return md_content + + +def main(): + # 创建Gradio接口 + with gr.Blocks() as demo: + gr.Markdown("# PDF to Markdown Converter") + + with gr.Row(): + with gr.Column(): + pdf_file = gr.File(label="Upload PDF", file_types=['.pdf']) + md_output = gr.Markdown(label="Extracted Markdown") + + extract_button = gr.Button("Extract Markdown") + extract_button.click(extract_markdown_from_pdf, inputs=[ + pdf_file], outputs=[md_output]) + + demo.launch(share=True) + + +if __name__ == '__main__': + main() diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..35d956322990a0246b92ad9d6434967a64e61ab1 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,31 @@ +import os +import json + +from loguru import logger + +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter + +import magic_pdf.model as model_config +model_config.__use_inside_model__ = True + +try: + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + demo_name = "demo1" + pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf") + model_path = os.path.join(current_script_dir, f"{demo_name}.json") + pdf_bytes = open(pdf_path, "rb").read() + # model_json = json.loads(open(model_path, "r", 
encoding="utf-8").read()) + model_json = [] # model_json传空list使用内置模型解析 + jso_useful_key = {"_pdf_type": "", "model_list": model_json} + local_image_dir = os.path.join(current_script_dir, 'images') + image_dir = str(os.path.basename(local_image_dir)) + image_writer = DiskReaderWriter(local_image_dir) + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) + pipe.pipe_classify() + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") + with open(f"{demo_name}.md", "w", encoding="utf-8") as f: + f.write(md_content) +except Exception as e: + logger.exception(e) \ No newline at end of file diff --git a/demo/demo1.json b/demo/demo1.json new file mode 100644 index 0000000000000000000000000000000000000000..e3b5a30aab7edc2779b083257a0e0cf1473c84d9 --- /dev/null +++ b/demo/demo1.json @@ -0,0 +1 @@ +[{"layout_dets": [{"category_id": 2, "poly": [117.85147857666016, 198.19203186035156, 268.09375, 198.19203186035156, 268.09375, 365.4513854980469, 117.85147857666016, 365.4513854980469], "score": 1.0}, {"category_id": 2, "poly": [516.9244995117188, 193.8611297607422, 983.7249145507812, 193.8611297607422, 983.7249145507812, 288.566650390625, 516.9244995117188, 288.566650390625], "score": 0.9999980926513672}, {"category_id": 2, "poly": [119.0521469116211, 1793.3775634765625, 774.3035888671875, 1793.3775634765625, 774.3035888671875, 1842.8583984375, 119.0521469116211, 1842.8583984375], "score": 0.9999951720237732}, {"category_id": 1, "poly": [213.19744873046875, 621.9070434570312, 1290.4381103515625, 621.9070434570312, 1290.4381103515625, 733.4085693359375, 213.19744873046875, 733.4085693359375], "score": 0.9999936819076538}, {"category_id": 1, "poly": [390.47998046875, 751.6647338867188, 1108.0994873046875, 751.6647338867188, 1108.0994873046875, 774.5253295898438, 390.47998046875, 774.5253295898438], "score": 0.9999909400939941}, {"category_id": 2, "poly": [556.6760864257812, 343.6651306152344, 942.158447265625, 343.6651306152344, 942.158447265625, 
368.6150207519531, 556.6760864257812, 368.6150207519531], "score": 0.9999899864196777}, {"category_id": 0, "poly": [245.8207244873047, 472.72943115234375, 1257.65380859375, 472.72943115234375, 1257.65380859375, 520.0311889648438, 245.8207244873047, 520.0311889648438], "score": 0.9999768137931824}, {"category_id": 2, "poly": [1119.6229248046875, 199.3274383544922, 1376.630859375, 199.3274383544922, 1376.630859375, 384.0538024902344, 1119.6229248046875, 384.0538024902344], "score": 0.9999668002128601}, {"category_id": 1, "poly": [118.14305114746094, 1571.5140380859375, 864.8477172851562, 1571.5140380859375, 864.8477172851562, 1594.3565673828125, 118.14305114746094, 1594.3565673828125], "score": 0.999945342540741}, {"category_id": 0, "poly": [118.69384002685547, 862.561767578125, 209.67910766601562, 862.561767578125, 209.67910766601562, 888.9332885742188, 118.69384002685547, 888.9332885742188], "score": 0.9999412298202515}, {"category_id": 1, "poly": [239.3308868408203, 550.2936401367188, 1257.6968994140625, 550.2936401367188, 1257.6968994140625, 596.7587280273438, 239.3308868408203, 596.7587280273438], "score": 0.9999355673789978}, {"category_id": 2, "poly": [117.71773529052734, 1687.8800048828125, 1379.2835693359375, 1687.8800048828125, 1379.2835693359375, 1766.3516845703125, 117.71773529052734, 1766.3516845703125], "score": 0.999925971031189}, {"category_id": 1, "poly": [115.68157958984375, 913.7571411132812, 1385.33837890625, 913.7571411132812, 1385.33837890625, 1533.5689697265625, 115.68157958984375, 1533.5689697265625], "score": 0.999893307685852}, {"category_id": 2, "poly": [1084.155517578125, 374.07135009765625, 1378.12109375, 374.07135009765625, 1378.12109375, 396.0621032714844, 1084.155517578125, 396.0621032714844], "score": 0.9371034502983093}, {"category_id": 13, "poly": [714, 1383, 767, 1383, 767, 1411, 714, 1411], "score": 0.89, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [571, 1351, 636, 1351, 636, 1380, 571, 1380], "score": 0.87, 
"latex": "(N_{\\mathrm{zero}})"}, {"category_id": 13, "poly": [398, 1793, 419, 1793, 419, 1815, 398, 1815], "score": 0.75, "latex": "\\copyright"}, {"category_id": 13, "poly": [116, 1509, 140, 1509, 140, 1533, 116, 1533], "score": 0.73, "latex": "\\copyright"}, {"category_id": 13, "poly": [315, 1713, 479, 1713, 479, 1739, 315, 1739], "score": 0.36, "latex": "+61\\;3\\;9450\\;8719"}, {"category_id": 13, "poly": [148, 1743, 166, 1743, 166, 1765, 148, 1765], "score": 0.35, "latex": "E"}, {"category_id": 13, "poly": [369, 1743, 387, 1743, 387, 1764, 369, 1764], "score": 0.26, "latex": "@"}, {"category_id": 15, "poly": [120.0, 338.0, 266.0, 338.0, 266.0, 374.0, 120.0, 374.0], "score": 1.0, "text": "ELSEVIER"}, {"category_id": 15, "poly": [515.0, 194.0, 986.0, 194.0, 986.0, 224.0, 515.0, 224.0], "score": 0.99, "text": "Available online at www.sciencedirect.com"}, {"category_id": 15, "poly": [599.0, 245.0, 728.0, 245.0, 728.0, 275.0, 599.0, 275.0], "score": 0.99, "text": "SCIENCE"}, {"category_id": 15, "poly": [712.0, 237.0, 905.0, 229.0, 907.0, 281.0, 714.0, 289.0], "score": 0.77, "text": "CDIRECT."}, {"category_id": 15, "poly": [116.0, 1819.0, 427.0, 1819.0, 427.0, 1847.0, 116.0, 1847.0], "score": 0.99, "text": "doi:10.1016/j.jhydrol.2005.01.006"}, {"category_id": 15, "poly": [114.0, 1793.0, 397.0, 1793.0, 397.0, 1821.0, 114.0, 1821.0], "score": 0.96, "text": "0022-1694/$ - see front matter"}, {"category_id": 15, "poly": [420.0, 1793.0, 777.0, 1793.0, 777.0, 1821.0, 420.0, 1821.0], "score": 0.98, "text": " 2005 Elsevier B.V. 
All rights reserved."}, {"category_id": 15, "poly": [210.0, 624.0, 1291.0, 624.0, 1291.0, 654.0, 210.0, 654.0], "score": 0.97, "text": "aSchool of Forest and Ecosystem Studies,University of Melbourne,P.O.Box 137,Heidelberg,Victoria 3084,Australia"}, {"category_id": 15, "poly": [460.0, 647.0, 1040.0, 649.0, 1039.0, 679.0, 460.0, 677.0], "score": 0.96, "text": "bCSIRODivision of Land andWater,Canberra,ACT,Australia"}, {"category_id": 15, "poly": [369.0, 679.0, 1130.0, 679.0, 1130.0, 710.0, 369.0, 710.0], "score": 0.97, "text": "cCooperative Research Centre for Catchment Hydrology, Canberra,ACT, Australia"}, {"category_id": 15, "poly": [299.0, 701.0, 1203.0, 703.0, 1203.0, 740.0, 299.0, 737.0], "score": 0.98, "text": "dDepartment of Civil and Environmental Engineering, University of Melbourne, Victoria, Australia"}, {"category_id": 15, "poly": [389.0, 750.0, 1108.0, 750.0, 1108.0, 780.0, 389.0, 780.0], "score": 0.99, "text": "Received 1 October 2003; revised 22 December 2004; accepted 3 January 2005"}, {"category_id": 15, "poly": [554.0, 340.0, 945.0, 337.0, 945.0, 374.0, 554.0, 376.0], "score": 0.98, "text": "Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [247.0, 477.0, 1252.0, 477.0, 1252.0, 520.0, 247.0, 520.0], "score": 0.99, "text": "The response of flow duration curves to afforestation"}, {"category_id": 15, "poly": [1165.0, 212.0, 1285.0, 218.0, 1283.0, 256.0, 1164.0, 251.0], "score": 1.0, "text": "Journal"}, {"category_id": 15, "poly": [1171.0, 260.0, 1207.0, 260.0, 1207.0, 290.0, 1171.0, 290.0], "score": 0.84, "text": "of"}, {"category_id": 15, "poly": [1157.0, 290.0, 1379.0, 297.0, 1378.0, 351.0, 1155.0, 343.0], "score": 1.0, "text": "Hydrology"}, {"category_id": 15, "poly": [1164.0, 374.0, 1368.0, 374.0, 1368.0, 389.0, 1164.0, 389.0], "score": 0.73, "text": "nuriarnom/laonta/ihrdr"}, {"category_id": 15, "poly": [116.0, 1572.0, 868.0, 1572.0, 868.0, 1600.0, 116.0, 1600.0], "score": 0.99, "text": "Keywords: Afforestation; Flow duration 
curves; Flow reduction; Paired catchments"}, {"category_id": 15, "poly": [116.0, 862.0, 213.0, 862.0, 213.0, 894.0, 116.0, 894.0], "score": 1.0, "text": "Abstract"}, {"category_id": 15, "poly": [238.0, 557.0, 1254.0, 557.0, 1254.0, 600.0, 238.0, 600.0], "score": 0.94, "text": "Patrick N.J. Lanea,c,*, Alice E. Bestb,c,d, Klaus Hickelb;c, Lu Zhangbc"}, {"category_id": 15, "poly": [127.0, 1681.0, 1381.0, 1683.0, 1381.0, 1720.0, 127.0, 1718.0], "score": 0.98, "text": "* Corresponding author. Address: Forest Science Centre, Department of Sustainability and Environment, P.O. Box 137, Heidelberg, Vic."}, {"category_id": 15, "poly": [114.0, 1711.0, 314.0, 1714.0, 314.0, 1744.0, 114.0, 1741.0], "score": 0.97, "text": "3084,Australia.Tel.:"}, {"category_id": 15, "poly": [480.0, 1711.0, 702.0, 1714.0, 702.0, 1744.0, 480.0, 1741.0], "score": 0.93, "text": ";fax: +61 3 9450 8644."}, {"category_id": 15, "poly": [167.0, 1744.0, 368.0, 1744.0, 368.0, 1772.0, 167.0, 1772.0], "score": 1.0, "text": "mailaddress:patrickl"}, {"category_id": 15, "poly": [388.0, 1744.0, 657.0, 1744.0, 657.0, 1772.0, 388.0, 1772.0], "score": 1.0, "text": "unimelb.edu.au (P.N.J. Lane)."}, {"category_id": 15, "poly": [137.0, 912.0, 1385.0, 912.0, 1385.0, 948.0, 137.0, 948.0], "score": 0.98, "text": " The hydrologic effect of replacing pasture or other short crops with trees is reasonably well understood on a mean annual"}, {"category_id": 15, "poly": [116.0, 946.0, 1383.0, 946.0, 1383.0, 976.0, 116.0, 976.0], "score": 0.99, "text": "basis. The impact on fow regime, as described by the annual flow duration curve (FDC) is less certain. A method to assess the"}, {"category_id": 15, "poly": [114.0, 974.0, 1383.0, 974.0, 1383.0, 1010.0, 114.0, 1010.0], "score": 0.99, "text": "impact of plantation establishment on FDCs was developed. 
The starting point for the analyses was the assumption that rainfall"}, {"category_id": 15, "poly": [116.0, 1008.0, 1381.0, 1008.0, 1381.0, 1038.0, 116.0, 1038.0], "score": 0.99, "text": "and vegetation age are the principal drivers of evapotranspiration. A key objective was to remove the variability in the rainfall"}, {"category_id": 15, "poly": [116.0, 1041.0, 1381.0, 1041.0, 1381.0, 1071.0, 116.0, 1071.0], "score": 0.99, "text": "signal, leaving changes in streamflow solely attributable to the evapotranspiration of the plantation. A method was developed to"}, {"category_id": 15, "poly": [116.0, 1073.0, 1381.0, 1073.0, 1381.0, 1103.0, 116.0, 1103.0], "score": 0.98, "text": "(1) fit a model to the observed annual time series of FDC percentiles; i.e. 1oth percentile for each year of record with annual"}, {"category_id": 15, "poly": [114.0, 1101.0, 1381.0, 1103.0, 1381.0, 1133.0, 114.0, 1131.0], "score": 0.99, "text": "rainfall and plantation age as parameters, (2) replace the annual rainfall variation with the long term mean to obtain climate"}, {"category_id": 15, "poly": [118.0, 1135.0, 1383.0, 1135.0, 1383.0, 1165.0, 118.0, 1165.0], "score": 0.99, "text": "adjusted FDCs, and (3) quantify changes in FDC percentiles as plantations age. Data from 10 catchments from Australia, South"}, {"category_id": 15, "poly": [118.0, 1165.0, 1381.0, 1165.0, 1381.0, 1195.0, 118.0, 1195.0], "score": 0.99, "text": "Africa and New Zealand were used. The model was able to represent flow variation for the majority of percentiles at eight of the"}, {"category_id": 15, "poly": [114.0, 1191.0, 1383.0, 1193.0, 1383.0, 1230.0, 114.0, 1228.0], "score": 0.98, "text": "10 catchments, particularly for the 10-50th percentiles. The adjusted FDCs revealed variable patterns in flow reductions with"}, {"category_id": 15, "poly": [116.0, 1230.0, 1379.0, 1230.0, 1379.0, 1260.0, 116.0, 1260.0], "score": 0.98, "text": "two types of responses (groups) being identified. 
Group 1 catchments show a substantial increase in the number of zero fow"}, {"category_id": 15, "poly": [114.0, 1258.0, 1381.0, 1260.0, 1381.0, 1290.0, 114.0, 1288.0], "score": 0.98, "text": "days, with low flows being more affected than high flows. Group 2 catchments show a more uniform reduction in flows across"}, {"category_id": 15, "poly": [116.0, 1292.0, 1383.0, 1292.0, 1383.0, 1322.0, 116.0, 1322.0], "score": 0.98, "text": "all percentiles. The differences may be partly explained by storage characteristics. The modelled fow reductions were in accord"}, {"category_id": 15, "poly": [116.0, 1322.0, 1381.0, 1322.0, 1381.0, 1352.0, 116.0, 1352.0], "score": 1.0, "text": "with published results of paired catchment experiments. An additional analysis was performed to characterise the impact of"}, {"category_id": 15, "poly": [116.0, 1417.0, 1381.0, 1417.0, 1381.0, 1447.0, 116.0, 1447.0], "score": 1.0, "text": "in the occurrence of any given flow in response to afforestation. The methods used in this study proved satisfactory in removing"}, {"category_id": 15, "poly": [116.0, 1449.0, 1383.0, 1449.0, 1383.0, 1479.0, 116.0, 1479.0], "score": 0.99, "text": "the rainfall variability, and have added useful insight into the hydrologic impacts of plantation establishment. 
This approach"}, {"category_id": 15, "poly": [116.0, 1479.0, 1379.0, 1479.0, 1379.0, 1509.0, 116.0, 1509.0], "score": 0.99, "text": "provides a methodology for understanding catchment response to afforestation, where paired catchment data is not available."}, {"category_id": 15, "poly": [114.0, 1382.0, 713.0, 1387.0, 713.0, 1417.0, 114.0, 1413.0], "score": 0.98, "text": "when adjusted for climate, indicated a significant increase in"}, {"category_id": 15, "poly": [768.0, 1382.0, 1381.0, 1387.0, 1381.0, 1417.0, 768.0, 1413.0], "score": 0.98, "text": ".The zero flow day method could be used to determine change"}, {"category_id": 15, "poly": [116.0, 1354.0, 570.0, 1354.0, 570.0, 1385.0, 116.0, 1385.0], "score": 0.98, "text": "afforestation on the number of zero flow days"}, {"category_id": 15, "poly": [637.0, 1354.0, 1383.0, 1354.0, 1383.0, 1385.0, 637.0, 1385.0], "score": 0.99, "text": "for the catchments in group 1. This model performed particularly well, and"}, {"category_id": 15, "poly": [141.0, 1507.0, 541.0, 1509.0, 541.0, 1539.0, 141.0, 1537.0], "score": 0.98, "text": "2005 Elsevier B.V. 
All rights reserved."}, {"category_id": 15, "poly": [1080.0, 368.0, 1383.0, 365.0, 1383.0, 402.0, 1080.0, 404.0], "score": 0.99, "text": "www.elsevier.com/locate/jhydrol"}], "page_info": {"page_no": 0, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [130.931640625, 251.82516479492188, 312.8154296875, 251.82516479492188, 312.8154296875, 283.4620056152344, 130.931640625, 283.4620056152344], "score": 0.9999987483024597}, {"category_id": 4, "poly": [794.2171020507812, 763.5051879882812, 1396.4493408203125, 763.5051879882812, 1396.4493408203125, 818.8292236328125, 794.2171020507812, 818.8292236328125], "score": 0.9999982714653015}, {"category_id": 1, "poly": [130.19113159179688, 1017.6807861328125, 732.7059326171875, 1017.6807861328125, 732.7059326171875, 1849.8070068359375, 130.19113159179688, 1849.8070068359375], "score": 0.9999954104423523}, {"category_id": 1, "poly": [793.3727416992188, 1280.632568359375, 1397.07080078125, 1280.632568359375, 1397.07080078125, 1849.0452880859375, 793.3727416992188, 1849.0452880859375], "score": 0.9999947547912598}, {"category_id": 1, "poly": [793.5277099609375, 849.8186645507812, 1397.0140380859375, 849.8186645507812, 1397.0140380859375, 1280.6221923828125, 793.5277099609375, 1280.6221923828125], "score": 0.999994158744812}, {"category_id": 1, "poly": [130.5381317138672, 317.5604248046875, 731.9227905273438, 317.5604248046875, 731.9227905273438, 1015.91748046875, 130.5381317138672, 1015.91748046875], "score": 0.9999940395355225}, {"category_id": 2, "poly": [130.44467163085938, 194.42764282226562, 166.39125061035156, 194.42764282226562, 166.39125061035156, 215.1434783935547, 130.44467163085938, 215.1434783935547], "score": 0.999992847442627}, {"category_id": 2, "poly": [479.5857849121094, 195.1154022216797, 1045.4803466796875, 195.1154022216797, 1045.4803466796875, 218.7963104248047, 479.5857849121094, 218.7963104248047], "score": 0.99998939037323}, {"category_id": 3, "poly": [799.3821411132812, 
256.1320495605469, 1390.73681640625, 256.1320495605469, 1390.73681640625, 742.4434204101562, 799.3821411132812, 742.4434204101562], "score": 0.9999882578849792}, {"category_id": 13, "poly": [984, 1180, 1065, 1180, 1065, 1211, 984, 1211], "score": 0.88, "latex": "<20\\%"}, {"category_id": 13, "poly": [128, 1415, 183, 1415, 183, 1445, 128, 1445], "score": 0.86, "latex": "95\\%"}, {"category_id": 13, "poly": [573, 618, 723, 618, 723, 649, 573, 649], "score": 0.67, "latex": "400\u2013500\\;\\mathrm{mm}"}, {"category_id": 15, "poly": [127.0, 249.0, 316.0, 254.0, 315.0, 291.0, 126.0, 286.0], "score": 1.0, "text": "1. Introduction"}, {"category_id": 15, "poly": [793.0, 765.0, 1394.0, 765.0, 1394.0, 793.0, 793.0, 793.0], "score": 0.98, "text": "Fig. 1. Annual flow duration curves of daily flows from Pine Creek,"}, {"category_id": 15, "poly": [793.0, 793.0, 999.0, 793.0, 999.0, 821.0, 793.0, 821.0], "score": 0.97, "text": "Australia, 1989-2000."}, {"category_id": 15, "poly": [161.0, 1017.0, 735.0, 1017.0, 735.0, 1054.0, 161.0, 1054.0], "score": 0.98, "text": "Zhang et al. (1999, 2001) developed simple and"}, {"category_id": 15, "poly": [127.0, 1051.0, 735.0, 1051.0, 735.0, 1088.0, 127.0, 1088.0], "score": 0.99, "text": "easily parameterised models to predict changes in"}, {"category_id": 15, "poly": [129.0, 1086.0, 730.0, 1086.0, 730.0, 1116.0, 129.0, 1116.0], "score": 0.99, "text": "mean annual fows following afforestation. 
However,"}, {"category_id": 15, "poly": [129.0, 1120.0, 732.0, 1120.0, 732.0, 1150.0, 129.0, 1150.0], "score": 0.98, "text": "there is a need to consider the annual flow regime as the"}, {"category_id": 15, "poly": [129.0, 1152.0, 732.0, 1152.0, 732.0, 1182.0, 129.0, 1182.0], "score": 0.99, "text": "relative changes in high and low flows may have"}, {"category_id": 15, "poly": [129.0, 1187.0, 730.0, 1187.0, 730.0, 1217.0, 129.0, 1217.0], "score": 0.98, "text": "considerable site specific and downstream impacts.."}, {"category_id": 15, "poly": [129.0, 1219.0, 732.0, 1219.0, 732.0, 1249.0, 129.0, 1249.0], "score": 0.99, "text": "Sikka et al. (2003) recently showed a change from"}, {"category_id": 15, "poly": [127.0, 1249.0, 734.0, 1247.0, 735.0, 1284.0, 127.0, 1286.0], "score": 1.0, "text": "grassland to Eucalyptus globulus plantations in India"}, {"category_id": 15, "poly": [129.0, 1284.0, 728.0, 1284.0, 728.0, 1314.0, 129.0, 1314.0], "score": 0.98, "text": "decreased alow flow index by a factor of two during the"}, {"category_id": 15, "poly": [127.0, 1316.0, 735.0, 1316.0, 735.0, 1352.0, 127.0, 1352.0], "score": 0.99, "text": "first rotation (9 years), and by 3.75 during the second"}, {"category_id": 15, "poly": [129.0, 1352.0, 732.0, 1352.0, 732.0, 1382.0, 129.0, 1382.0], "score": 1.0, "text": "rotation, with more subdued impact on peak flows. The"}, {"category_id": 15, "poly": [129.0, 1385.0, 732.0, 1385.0, 732.0, 1415.0, 129.0, 1415.0], "score": 0.99, "text": "index was defined as the 10 day average flow exceeded"}, {"category_id": 15, "poly": [125.0, 1447.0, 735.0, 1449.0, 734.0, 1486.0, 125.0, 1483.0], "score": 0.98, "text": "duration curves. 
Scott and Smith (1997) reported"}, {"category_id": 15, "poly": [129.0, 1486.0, 732.0, 1486.0, 732.0, 1516.0, 129.0, 1516.0], "score": 0.96, "text": "proportionally greater reductions in low fows"}, {"category_id": 15, "poly": [125.0, 1511.0, 737.0, 1514.0, 737.0, 1550.0, 125.0, 1548.0], "score": 0.98, "text": "(75-100th percentiles) than annual flows from South"}, {"category_id": 15, "poly": [127.0, 1548.0, 735.0, 1550.0, 734.0, 1580.0, 127.0, 1578.0], "score": 0.99, "text": "African research catchments under conversions from"}, {"category_id": 15, "poly": [125.0, 1582.0, 737.0, 1580.0, 737.0, 1617.0, 125.0, 1619.0], "score": 0.98, "text": " grass to pine and eucalypt plantations, while Bosch"}, {"category_id": 15, "poly": [129.0, 1619.0, 732.0, 1619.0, 732.0, 1649.0, 129.0, 1649.0], "score": 0.98, "text": "(1979) found the greatest reduction in seasonal flow"}, {"category_id": 15, "poly": [129.0, 1651.0, 732.0, 1651.0, 732.0, 1681.0, 129.0, 1681.0], "score": 0.98, "text": "from the summer wet season. Fahey and Jackson"}, {"category_id": 15, "poly": [125.0, 1679.0, 735.0, 1681.0, 734.0, 1718.0, 125.0, 1716.0], "score": 0.99, "text": "(1997) reported the reduction in peak flows was twice"}, {"category_id": 15, "poly": [129.0, 1718.0, 732.0, 1718.0, 732.0, 1748.0, 129.0, 1748.0], "score": 0.98, "text": "that of total flow and low flows for pine afforestation in"}, {"category_id": 15, "poly": [125.0, 1746.0, 732.0, 1748.0, 732.0, 1785.0, 125.0, 1782.0], "score": 0.98, "text": " New Zealand. 
The generalisations that can be drawn"}, {"category_id": 15, "poly": [129.0, 1784.0, 728.0, 1784.0, 728.0, 1815.0, 129.0, 1815.0], "score": 0.99, "text": "from annual analyses, where processes and hydrologic"}, {"category_id": 15, "poly": [127.0, 1819.0, 732.0, 1817.0, 732.0, 1847.0, 127.0, 1849.0], "score": 0.99, "text": "responses are to a certain extent integrated may not"}, {"category_id": 15, "poly": [184.0, 1415.0, 732.0, 1417.0, 732.0, 1447.0, 184.0, 1445.0], "score": 0.99, "text": "of the time, obtained from analysis of 10-day flow"}, {"category_id": 15, "poly": [823.0, 1277.0, 1400.0, 1279.0, 1400.0, 1316.0, 823.0, 1314.0], "score": 0.98, "text": " This paper presents the results of a project aimed at"}, {"category_id": 15, "poly": [793.0, 1316.0, 1398.0, 1316.0, 1398.0, 1346.0, 793.0, 1346.0], "score": 0.96, "text": "quantifying changes in annual fow regime of"}, {"category_id": 15, "poly": [793.0, 1350.0, 1398.0, 1350.0, 1398.0, 1380.0, 793.0, 1380.0], "score": 0.99, "text": "catchments following plantation establishment. The"}, {"category_id": 15, "poly": [793.0, 1385.0, 1398.0, 1385.0, 1398.0, 1415.0, 793.0, 1415.0], "score": 0.98, "text": "flow regime is represented by the flow duration curve"}, {"category_id": 15, "poly": [793.0, 1417.0, 1398.0, 1417.0, 1398.0, 1447.0, 793.0, 1447.0], "score": 0.99, "text": "(FDC). The key assumption was that rainfall and"}, {"category_id": 15, "poly": [793.0, 1451.0, 1396.0, 1451.0, 1396.0, 1481.0, 793.0, 1481.0], "score": 0.99, "text": "forest age are the principal drivers of evapotranspira-"}, {"category_id": 15, "poly": [788.0, 1481.0, 1400.0, 1481.0, 1400.0, 1518.0, 788.0, 1518.0], "score": 0.99, "text": "tion. 
For any generalisation of response of the FDC to"}, {"category_id": 15, "poly": [793.0, 1518.0, 1398.0, 1518.0, 1398.0, 1548.0, 793.0, 1548.0], "score": 0.99, "text": "vegetation change, the variation in the annual climate"}, {"category_id": 15, "poly": [790.0, 1550.0, 1398.0, 1550.0, 1398.0, 1580.0, 790.0, 1580.0], "score": 0.97, "text": "signal must be removed. The time-tested solution to"}, {"category_id": 15, "poly": [790.0, 1585.0, 1398.0, 1585.0, 1398.0, 1615.0, 790.0, 1615.0], "score": 1.0, "text": "this problem is the paired-catchment (control versus"}, {"category_id": 15, "poly": [790.0, 1617.0, 1398.0, 1617.0, 1398.0, 1647.0, 790.0, 1647.0], "score": 0.98, "text": "treatment) experiment. The benefits in such studies"}, {"category_id": 15, "poly": [793.0, 1651.0, 1396.0, 1651.0, 1396.0, 1681.0, 793.0, 1681.0], "score": 0.98, "text": "are manifold: unambiguous measures of trends,"}, {"category_id": 15, "poly": [790.0, 1686.0, 1392.0, 1686.0, 1392.0, 1716.0, 790.0, 1716.0], "score": 0.99, "text": "insights into the processes driving those trends,"}, {"category_id": 15, "poly": [793.0, 1716.0, 1400.0, 1716.0, 1400.0, 1752.0, 793.0, 1752.0], "score": 0.96, "text": "excellent opportunities for model parameterisation"}, {"category_id": 15, "poly": [793.0, 1750.0, 1394.0, 1750.0, 1394.0, 1780.0, 793.0, 1780.0], "score": 0.98, "text": "and validation. However these data are not readily"}, {"category_id": 15, "poly": [790.0, 1784.0, 1390.0, 1784.0, 1390.0, 1815.0, 790.0, 1815.0], "score": 0.99, "text": "available for the range of treamtments and environ-"}, {"category_id": 15, "poly": [790.0, 1817.0, 1396.0, 1817.0, 1396.0, 1847.0, 790.0, 1847.0], "score": 0.99, "text": " ments required. Consequently, the aims of this project"}, {"category_id": 15, "poly": [793.0, 851.0, 1398.0, 851.0, 1398.0, 882.0, 793.0, 882.0], "score": 0.99, "text": "apply on a seasonal or shorter scale. 
Further, the"}, {"category_id": 15, "poly": [788.0, 879.0, 1398.0, 882.0, 1398.0, 918.0, 788.0, 916.0], "score": 1.0, "text": " observed impacts of any land use change on flows may"}, {"category_id": 15, "poly": [788.0, 916.0, 1400.0, 916.0, 1400.0, 952.0, 788.0, 952.0], "score": 0.96, "text": "be exaggerated or understated depending on the"}, {"category_id": 15, "poly": [788.0, 948.0, 1400.0, 948.0, 1400.0, 985.0, 788.0, 985.0], "score": 0.99, "text": "prevailing climate. Observations of fow during"}, {"category_id": 15, "poly": [793.0, 985.0, 1398.0, 985.0, 1398.0, 1015.0, 793.0, 1015.0], "score": 0.98, "text": "extended wet or dry spells, or with high annual"}, {"category_id": 15, "poly": [793.0, 1017.0, 1398.0, 1017.0, 1398.0, 1047.0, 793.0, 1047.0], "score": 1.0, "text": "variability can obscure the real impacts. Fig. 1 plots"}, {"category_id": 15, "poly": [790.0, 1051.0, 1398.0, 1051.0, 1398.0, 1081.0, 790.0, 1081.0], "score": 0.98, "text": " annual FDCs over 12 years of plantation growth for one"}, {"category_id": 15, "poly": [793.0, 1084.0, 1398.0, 1084.0, 1398.0, 1114.0, 793.0, 1114.0], "score": 0.99, "text": "of the catchments used in this study, Pine Creek. The"}, {"category_id": 15, "poly": [786.0, 1114.0, 1400.0, 1116.0, 1400.0, 1152.0, 786.0, 1150.0], "score": 0.97, "text": " net change in flow is obscured by rainfall variability;"}, {"category_id": 15, "poly": [788.0, 1148.0, 1400.0, 1146.0, 1400.0, 1182.0, 788.0, 1185.0], "score": 1.0, "text": "e.g. 
the greatest change in the FDC is in 1996, with the"}, {"category_id": 15, "poly": [786.0, 1215.0, 1398.0, 1213.0, 1398.0, 1249.0, 786.0, 1251.0], "score": 0.99, "text": " compared with 2000, where there is substantially"}, {"category_id": 15, "poly": [788.0, 1249.0, 941.0, 1249.0, 941.0, 1279.0, 788.0, 1279.0], "score": 0.99, "text": "higher flows."}, {"category_id": 15, "poly": [788.0, 1180.0, 983.0, 1180.0, 983.0, 1217.0, 788.0, 1217.0], "score": 0.96, "text": "stream flowing"}, {"category_id": 15, "poly": [1066.0, 1180.0, 1400.0, 1180.0, 1400.0, 1217.0, 1066.0, 1217.0], "score": 0.96, "text": " of the time. This may be"}, {"category_id": 15, "poly": [161.0, 318.0, 728.0, 318.0, 728.0, 355.0, 161.0, 355.0], "score": 1.0, "text": "Widespread afforestation through plantation estab-"}, {"category_id": 15, "poly": [125.0, 348.0, 732.0, 350.0, 732.0, 387.0, 125.0, 385.0], "score": 1.0, "text": "lishment on non-forested land represents a potentially"}, {"category_id": 15, "poly": [129.0, 389.0, 732.0, 389.0, 732.0, 417.0, 129.0, 417.0], "score": 0.98, "text": "significant alteration of catchment evapotranspiration"}, {"category_id": 15, "poly": [129.0, 421.0, 730.0, 421.0, 730.0, 452.0, 129.0, 452.0], "score": 0.98, "text": "(ET). 
Using data collated from multiple catchment"}, {"category_id": 15, "poly": [129.0, 456.0, 732.0, 456.0, 732.0, 484.0, 129.0, 484.0], "score": 0.99, "text": "studies, researchers have demonstrated a consistent"}, {"category_id": 15, "poly": [125.0, 482.0, 737.0, 484.0, 737.0, 520.0, 125.0, 518.0], "score": 0.98, "text": " difference in ET between forests and grass or short "}, {"category_id": 15, "poly": [122.0, 518.0, 734.0, 516.0, 735.0, 553.0, 123.0, 555.0], "score": 0.99, "text": " crops, and the relationship between ET and rainfall on"}, {"category_id": 15, "poly": [127.0, 553.0, 732.0, 553.0, 732.0, 583.0, 127.0, 583.0], "score": 1.0, "text": "a mean annual basis (Holmes and Sinclair, 1986;"}, {"category_id": 15, "poly": [127.0, 585.0, 732.0, 585.0, 732.0, 621.0, 127.0, 621.0], "score": 0.99, "text": "Vertessy and Bessard, 1999; Zhang et al., 1999,"}, {"category_id": 15, "poly": [129.0, 654.0, 732.0, 654.0, 732.0, 684.0, 129.0, 684.0], "score": 0.99, "text": "there is an increasing divergence between forest and"}, {"category_id": 15, "poly": [125.0, 684.0, 734.0, 682.0, 735.0, 718.0, 125.0, 720.0], "score": 0.99, "text": "grassland ET (Zhang et al., 2001). Research from"}, {"category_id": 15, "poly": [127.0, 718.0, 732.0, 718.0, 732.0, 755.0, 127.0, 755.0], "score": 0.98, "text": "South Africa in particular has demonstrated flow"}, {"category_id": 15, "poly": [129.0, 755.0, 730.0, 755.0, 730.0, 785.0, 129.0, 785.0], "score": 1.0, "text": "reduction following afforestation with both pine and"}, {"category_id": 15, "poly": [125.0, 783.0, 732.0, 780.0, 732.0, 817.0, 125.0, 819.0], "score": 0.99, "text": "eucalypt species (Bosch, 1979; Van Lill et al., 1980;"}, {"category_id": 15, "poly": [131.0, 819.0, 732.0, 819.0, 732.0, 849.0, 131.0, 849.0], "score": 0.98, "text": "Van Wyk, 1987; Bosch and Von Gadow, 1990; Scott"}, {"category_id": 15, "poly": [129.0, 854.0, 730.0, 854.0, 730.0, 884.0, 129.0, 884.0], "score": 0.99, "text": "and Smith, 1997; Scott et al., 2000). 
In regions, where"}, {"category_id": 15, "poly": [129.0, 888.0, 732.0, 888.0, 732.0, 918.0, 129.0, 918.0], "score": 1.0, "text": "water is an increasingly valuable resource, prediction"}, {"category_id": 15, "poly": [125.0, 914.0, 735.0, 916.0, 734.0, 952.0, 125.0, 950.0], "score": 1.0, "text": " of the long-term hydrologic impact of afforestation is"}, {"category_id": 15, "poly": [127.0, 952.0, 732.0, 952.0, 732.0, 983.0, 127.0, 983.0], "score": 1.0, "text": "a prerequisite for the optimal planning of catchment"}, {"category_id": 15, "poly": [126.0, 982.0, 232.0, 987.0, 231.0, 1017.0, 124.0, 1012.0], "score": 0.98, "text": "land use."}, {"category_id": 15, "poly": [129.0, 619.0, 572.0, 619.0, 572.0, 649.0, 129.0, 649.0], "score": 0.97, "text": "2001). Once annual rainfall exceeds "}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 234.0, 127.0, 234.0], "score": 0.86, "text": "254"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}], "page_info": {"page_no": 1, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [117.54735565185547, 651.1103515625, 250.780029296875, 651.1103515625, 250.780029296875, 683.0104370117188, 117.54735565185547, 683.0104370117188], "score": 0.9999984502792358}, {"category_id": 0, "poly": [118.68109130859375, 719.37060546875, 523.2320556640625, 719.37060546875, 523.2320556640625, 748.71435546875, 118.68109130859375, 748.71435546875], "score": 0.9999982714653015}, {"category_id": 1, "poly": [782.3466796875, 254.3662872314453, 1379.406005859375, 254.3662872314453, 1379.406005859375, 382.8451843261719, 782.3466796875, 382.8451843261719], "score": 0.9999969005584717}, {"category_id": 2, "poly": [466.16595458984375, 194.14617919921875, 1030.9322509765625, 194.14617919921875, 1030.9322509765625, 218.86849975585938, 466.16595458984375, 218.86849975585938], "score": 0.9999963641166687}, {"category_id": 9, "poly": [1347.212890625, 1178.8819580078125, 1379.9034423828125, 1178.8819580078125, 1379.9034423828125, 1209.0960693359375, 1347.212890625, 1209.0960693359375], "score": 0.9999951124191284}, {"category_id": 1, "poly": [118.17451477050781, 252.63734436035156, 717.2734375, 252.63734436035156, 717.2734375, 582.23974609375, 118.17451477050781, 582.23974609375], "score": 0.999994158744812}, {"category_id": 1, "poly": [780.9387817382812, 518.9439697265625, 1381.2352294921875, 518.9439697265625, 1381.2352294921875, 1114.6259765625, 780.9387817382812, 1114.6259765625], "score": 0.9999930262565613}, {"category_id": 9, "poly": [1346.75439453125, 438.8963317871094, 1380.3604736328125, 438.8963317871094, 1380.3604736328125, 467.5118713378906, 1346.75439453125, 467.5118713378906], "score": 0.9999922513961792}, {"category_id": 1, "poly": [781.1512451171875, 1283.9832763671875, 1380.4686279296875, 1283.9832763671875, 1380.4686279296875, 1845.6868896484375, 781.1512451171875, 1845.6868896484375], "score": 
0.9999905824661255}, {"category_id": 1, "poly": [118.1343994140625, 788.8043212890625, 716.4190673828125, 788.8043212890625, 716.4190673828125, 1282.203125, 118.1343994140625, 1282.203125], "score": 0.9999904632568359}, {"category_id": 2, "poly": [1346.32177734375, 194.7462615966797, 1381.36328125, 194.7462615966797, 1381.36328125, 216.9466552734375, 1346.32177734375, 216.9466552734375], "score": 0.9999903440475464}, {"category_id": 1, "poly": [117.631591796875, 1283.8558349609375, 716.6098022460938, 1283.8558349609375, 716.6098022460938, 1847.49853515625, 117.631591796875, 1847.49853515625], "score": 0.9999891519546509}, {"category_id": 8, "poly": [778.0137939453125, 1156.5975341796875, 1201.7086181640625, 1156.5975341796875, 1201.7086181640625, 1238.48828125, 778.0137939453125, 1238.48828125], "score": 0.9998936653137207}, {"category_id": 8, "poly": [779.0469360351562, 433.1261901855469, 996.4776000976562, 433.1261901855469, 996.4776000976562, 470.7110595703125, 779.0469360351562, 470.7110595703125], "score": 0.979882001876831}, {"category_id": 14, "poly": [777, 1156, 1200, 1156, 1200, 1237, 777, 1237], "score": 0.92, "latex": "Q_{\\mathcal{U}}=a+b(\\Delta P)+\\frac{Y}{1+\\exp\\!\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 13, "poly": [1150, 520, 1201, 520, 1201, 551, 1150, 551], "score": 0.9, "latex": "f(P)"}, {"category_id": 13, "poly": [1210, 1384, 1262, 1384, 1262, 1414, 1210, 1414], "score": 0.9, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [856, 520, 897, 520, 897, 550, 856, 550], "score": 0.9, "latex": "Q_{\\mathcal{k}}"}, {"category_id": 13, "poly": [930, 552, 982, 552, 982, 584, 930, 584], "score": 0.89, "latex": "g(T)"}, {"category_id": 13, "poly": [857, 1285, 898, 1285, 898, 1315, 857, 1315], "score": 0.89, "latex": "Q_{\\mathcal{k}}"}, {"category_id": 13, "poly": [1196, 1649, 1278, 1649, 1278, 1678, 1196, 1678], "score": 0.89, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [1270, 1483, 1311, 1483, 1311, 
1515, 1270, 1515], "score": 0.89, "latex": "Q_{\\mathrm{\\small{\\mathscr{k}}}}"}, {"category_id": 13, "poly": [1259, 1418, 1301, 1418, 1301, 1449, 1259, 1449], "score": 0.89, "latex": "Q_{\\mathbb{X}}"}, {"category_id": 13, "poly": [1075, 1682, 1140, 1682, 1140, 1711, 1075, 1711], "score": 0.88, "latex": "a+Y."}, {"category_id": 13, "poly": [895, 1483, 976, 1483, 976, 1512, 895, 1512], "score": 0.88, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [1206, 1285, 1252, 1285, 1252, 1315, 1206, 1315], "score": 0.88, "latex": "Q_{50}"}, {"category_id": 13, "poly": [779, 1682, 821, 1682, 821, 1714, 779, 1714], "score": 0.88, "latex": "Q_{\\mathrm{\\%}}"}, {"category_id": 13, "poly": [1313, 1649, 1374, 1649, 1374, 1678, 1313, 1678], "score": 0.87, "latex": "T{=}0"}, {"category_id": 14, "poly": [777, 432, 997, 432, 997, 470, 777, 470], "score": 0.83, "latex": "\\begin{array}{r}{Q_{\\%}=f(P)+g(T)}\\end{array}"}, {"category_id": 13, "poly": [963, 1350, 1002, 1350, 1002, 1378, 963, 1378], "score": 0.8, "latex": "\\Delta P"}, {"category_id": 13, "poly": [989, 1318, 1012, 1318, 1012, 1345, 989, 1345], "score": 0.64, "latex": "Y"}, {"category_id": 13, "poly": [1077, 1318, 1098, 1318, 1098, 1345, 1077, 1345], "score": 0.64, "latex": "S"}, {"category_id": 13, "poly": [1239, 1583, 1262, 1583, 1262, 1611, 1239, 1611], "score": 0.51, "latex": "S"}, {"category_id": 13, "poly": [989, 1488, 1008, 1488, 1008, 1511, 989, 1511], "score": 0.3, "latex": "a"}, {"category_id": 15, "poly": [112.0, 651.0, 256.0, 651.0, 256.0, 688.0, 112.0, 688.0], "score": 0.96, "text": "2. Methods"}, {"category_id": 15, "poly": [112.0, 716.0, 526.0, 720.0, 526.0, 757.0, 112.0, 752.0], "score": 0.99, "text": "2.1. 
Characterisation of fow regime"}, {"category_id": 15, "poly": [778.0, 249.0, 1383.0, 252.0, 1383.0, 288.0, 777.0, 286.0], "score": 0.99, "text": " closure, a time term is required to represent plantation"}, {"category_id": 15, "poly": [777.0, 288.0, 1385.0, 284.0, 1385.0, 320.0, 778.0, 325.0], "score": 0.99, "text": "growth. A simple model relating the time series of"}, {"category_id": 15, "poly": [778.0, 318.0, 1383.0, 323.0, 1383.0, 357.0, 777.0, 353.0], "score": 0.99, "text": "each decile with rainfall and vegetation characteristics"}, {"category_id": 15, "poly": [782.0, 357.0, 1018.0, 357.0, 1018.0, 387.0, 782.0, 387.0], "score": 0.99, "text": "can be expressed as:"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [112.0, 249.0, 722.0, 252.0, 722.0, 288.0, 112.0, 286.0], "score": 0.99, "text": "were to (1) fit a model to the observed annual time"}, {"category_id": 15, "poly": [116.0, 290.0, 719.0, 290.0, 719.0, 320.0, 116.0, 320.0], "score": 0.98, "text": "series of FDC percentiles; i.e. 10th percentile for each"}, {"category_id": 15, "poly": [112.0, 320.0, 722.0, 320.0, 722.0, 357.0, 112.0, 357.0], "score": 0.99, "text": " year of record with annual rainfall and plantation age"}, {"category_id": 15, "poly": [116.0, 355.0, 719.0, 355.0, 719.0, 385.0, 116.0, 385.0], "score": 1.0, "text": "as parameters, (2) replace the annual rainfall variation"}, {"category_id": 15, "poly": [116.0, 389.0, 722.0, 389.0, 722.0, 419.0, 116.0, 419.0], "score": 0.98, "text": "with the long term mean to obtain climate adjusted"}, {"category_id": 15, "poly": [116.0, 421.0, 719.0, 421.0, 719.0, 452.0, 116.0, 452.0], "score": 0.99, "text": "FDCs, and (3) quantify changes in FDC percentiles as"}, {"category_id": 15, "poly": [116.0, 456.0, 717.0, 456.0, 717.0, 486.0, 116.0, 486.0], "score": 1.0, "text": "plantations age. 
If the climate signal, represented by"}, {"category_id": 15, "poly": [112.0, 482.0, 722.0, 486.0, 721.0, 523.0, 112.0, 518.0], "score": 0.98, "text": "rainfall, could be successfully removed, the resulting"}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 553.0, 116.0, 553.0], "score": 0.99, "text": "changes in the FDC would be solely attributable to the"}, {"category_id": 15, "poly": [114.0, 557.0, 243.0, 557.0, 243.0, 587.0, 114.0, 587.0], "score": 1.0, "text": "vegetation."}, {"category_id": 15, "poly": [780.0, 587.0, 1385.0, 587.0, 1385.0, 617.0, 780.0, 617.0], "score": 0.99, "text": "plantation. Annual rainfall was chosen as the rainfall"}, {"category_id": 15, "poly": [780.0, 621.0, 1385.0, 621.0, 1385.0, 649.0, 780.0, 649.0], "score": 0.99, "text": "statistic as it proved to be the most robust predictor of"}, {"category_id": 15, "poly": [780.0, 654.0, 1385.0, 654.0, 1385.0, 684.0, 780.0, 684.0], "score": 0.97, "text": "flow over the whole range of flow percentiles, as"}, {"category_id": 15, "poly": [777.0, 686.0, 1383.0, 686.0, 1383.0, 722.0, 777.0, 722.0], "score": 0.98, "text": " compared with rainfall percentiles; e.g. median rain-"}, {"category_id": 15, "poly": [777.0, 718.0, 1385.0, 718.0, 1385.0, 755.0, 777.0, 755.0], "score": 0.97, "text": "fall versus 10th flow percentile. The use of annual"}, {"category_id": 15, "poly": [775.0, 748.0, 1385.0, 750.0, 1385.0, 787.0, 775.0, 785.0], "score": 0.99, "text": "rainfall also minimises parameter complexity. The"}, {"category_id": 15, "poly": [782.0, 787.0, 1383.0, 787.0, 1383.0, 817.0, 782.0, 817.0], "score": 0.98, "text": "choice of model form is dependent on selecting a"}, {"category_id": 15, "poly": [780.0, 821.0, 1383.0, 821.0, 1383.0, 849.0, 780.0, 849.0], "score": 0.99, "text": "function that describes the relationship between forest"}, {"category_id": 15, "poly": [777.0, 854.0, 1383.0, 851.0, 1383.0, 881.0, 778.0, 884.0], "score": 0.98, "text": "age and ET. 
Scott and Smith (1997\uff09 demonstrated"}, {"category_id": 15, "poly": [780.0, 886.0, 1383.0, 886.0, 1383.0, 916.0, 780.0, 916.0], "score": 0.98, "text": "cumulative reductions in annual and low flows"}, {"category_id": 15, "poly": [780.0, 920.0, 1383.0, 920.0, 1383.0, 950.0, 780.0, 950.0], "score": 0.98, "text": "resulting from afforestation fitted a sigmoidal"}, {"category_id": 15, "poly": [777.0, 952.0, 1379.0, 952.0, 1379.0, 983.0, 777.0, 983.0], "score": 0.99, "text": "function, similar to forest growth functions. Conse-"}, {"category_id": 15, "poly": [775.0, 985.0, 1385.0, 983.0, 1385.0, 1019.0, 775.0, 1021.0], "score": 0.99, "text": " quently, we used a sigmoidal function to characterise"}, {"category_id": 15, "poly": [780.0, 1019.0, 1381.0, 1019.0, 1381.0, 1049.0, 780.0, 1049.0], "score": 0.99, "text": "the impact of plantation growth on each fow decile."}, {"category_id": 15, "poly": [780.0, 1054.0, 1383.0, 1054.0, 1383.0, 1084.0, 780.0, 1084.0], "score": 0.98, "text": "Fig. 2a is a schematic of the change in the FDC over"}, {"category_id": 15, "poly": [777.0, 1086.0, 1143.0, 1086.0, 1143.0, 1116.0, 777.0, 1116.0], "score": 0.99, "text": "time. 
The model took the form:"}, {"category_id": 15, "poly": [1202.0, 522.0, 1385.0, 522.0, 1385.0, 550.0, 1202.0, 550.0], "score": 0.99, "text": "is a function of"}, {"category_id": 15, "poly": [782.0, 522.0, 855.0, 522.0, 855.0, 550.0, 782.0, 550.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [898.0, 522.0, 1149.0, 522.0, 1149.0, 550.0, 898.0, 550.0], "score": 0.98, "text": "is the percentile flow,"}, {"category_id": 15, "poly": [780.0, 555.0, 929.0, 555.0, 929.0, 585.0, 780.0, 585.0], "score": 0.95, "text": "rainfall and"}, {"category_id": 15, "poly": [983.0, 555.0, 1383.0, 555.0, 1383.0, 585.0, 983.0, 585.0], "score": 0.98, "text": " is a function of the age of the"}, {"category_id": 15, "poly": [780.0, 1453.0, 1385.0, 1453.0, 1385.0, 1484.0, 780.0, 1484.0], "score": 1.0, "text": "afforestation has taken place. For the average climate"}, {"category_id": 15, "poly": [775.0, 1516.0, 1383.0, 1516.0, 1383.0, 1546.0, 775.0, 1546.0], "score": 0.98, "text": "the new equilibrium plantation water use under"}, {"category_id": 15, "poly": [777.0, 1552.0, 1385.0, 1552.0, 1385.0, 1582.0, 777.0, 1582.0], "score": 0.99, "text": " afforestation is reached. Y then gives the magnitude"}, {"category_id": 15, "poly": [780.0, 1619.0, 1385.0, 1619.0, 1385.0, 1649.0, 780.0, 1649.0], "score": 0.97, "text": "the shape of the response as shown in Fig. 2b. For"}, {"category_id": 15, "poly": [780.0, 1718.0, 1383.0, 1718.0, 1383.0, 1748.0, 780.0, 1748.0], "score": 0.98, "text": "afforestation condition would not require the time"}, {"category_id": 15, "poly": [780.0, 1752.0, 1383.0, 1752.0, 1383.0, 1782.0, 780.0, 1782.0], "score": 0.98, "text": "term. 
Details of the optimisation scheme and"}, {"category_id": 15, "poly": [780.0, 1784.0, 1383.0, 1784.0, 1383.0, 1815.0, 780.0, 1815.0], "score": 1.0, "text": "sensitivity tests on initial parameter values are given"}, {"category_id": 15, "poly": [780.0, 1817.0, 1020.0, 1817.0, 1020.0, 1847.0, 780.0, 1847.0], "score": 0.97, "text": "in Lane et al. (2003)."}, {"category_id": 15, "poly": [777.0, 1382.0, 1209.0, 1382.0, 1209.0, 1419.0, 777.0, 1419.0], "score": 0.98, "text": "from the period of record average, and"}, {"category_id": 15, "poly": [1263.0, 1382.0, 1385.0, 1382.0, 1385.0, 1419.0, 1263.0, 1419.0], "score": 0.99, "text": "is the time"}, {"category_id": 15, "poly": [782.0, 1286.0, 856.0, 1286.0, 856.0, 1316.0, 782.0, 1316.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [777.0, 1649.0, 1195.0, 1649.0, 1195.0, 1686.0, 777.0, 1686.0], "score": 1.0, "text": "the average pre-treatment condition"}, {"category_id": 15, "poly": [1312.0, 1486.0, 1385.0, 1486.0, 1385.0, 1516.0, 1312.0, 1516.0], "score": 1.0, "text": "when"}, {"category_id": 15, "poly": [780.0, 1419.0, 1258.0, 1419.0, 1258.0, 1449.0, 780.0, 1449.0], "score": 0.97, "text": "in years at which half of the reduction in"}, {"category_id": 15, "poly": [1302.0, 1419.0, 1385.0, 1419.0, 1385.0, 1449.0, 1302.0, 1449.0], "score": 1.0, "text": "due to"}, {"category_id": 15, "poly": [1141.0, 1686.0, 1379.0, 1686.0, 1379.0, 1716.0, 1141.0, 1716.0], "score": 0.95, "text": " Estimation of a pre-"}, {"category_id": 15, "poly": [780.0, 1486.0, 894.0, 1486.0, 894.0, 1516.0, 780.0, 1516.0], "score": 1.0, "text": "condition"}, {"category_id": 15, "poly": [899.0, 1286.0, 1205.0, 1286.0, 1205.0, 1316.0, 899.0, 1316.0], "score": 0.98, "text": "is the percentile flow (i.e."}, {"category_id": 15, "poly": [1253.0, 1286.0, 1383.0, 1286.0, 1383.0, 1316.0, 1253.0, 1316.0], "score": 1.0, "text": "is the 50th"}, {"category_id": 15, "poly": [822.0, 1686.0, 1074.0, 1686.0, 1074.0, 1716.0, 822.0, 1716.0], "score": 0.99, 
"text": " approximately equals"}, {"category_id": 15, "poly": [1279.0, 1649.0, 1312.0, 1649.0, 1312.0, 1686.0, 1279.0, 1686.0], "score": 1.0, "text": "at"}, {"category_id": 15, "poly": [777.0, 1352.0, 962.0, 1350.0, 962.0, 1380.0, 778.0, 1382.0], "score": 1.0, "text": "sigmoidal term,"}, {"category_id": 15, "poly": [1003.0, 1352.0, 1385.0, 1350.0, 1385.0, 1380.0, 1003.0, 1382.0], "score": 0.99, "text": "is the deviation of annual rainfall"}, {"category_id": 15, "poly": [775.0, 1316.0, 988.0, 1314.0, 988.0, 1350.0, 775.0, 1352.0], "score": 0.97, "text": "percentile flow),"}, {"category_id": 15, "poly": [1013.0, 1316.0, 1076.0, 1314.0, 1076.0, 1350.0, 1013.0, 1352.0], "score": 0.9, "text": " and"}, {"category_id": 15, "poly": [1099.0, 1316.0, 1385.0, 1314.0, 1385.0, 1350.0, 1099.0, 1352.0], "score": 0.98, "text": " are coefficients of the"}, {"category_id": 15, "poly": [780.0, 1587.0, 1238.0, 1587.0, 1238.0, 1617.0, 780.0, 1617.0], "score": 0.99, "text": "of change due to afforestation, and "}, {"category_id": 15, "poly": [1263.0, 1587.0, 1385.0, 1587.0, 1385.0, 1617.0, 1263.0, 1617.0], "score": 0.99, "text": " describes"}, {"category_id": 15, "poly": [1009.0, 1486.0, 1269.0, 1486.0, 1269.0, 1516.0, 1009.0, 1516.0], "score": 1.0, "text": "becomes the value of"}, {"category_id": 15, "poly": [144.0, 783.0, 720.0, 785.0, 719.0, 821.0, 144.0, 819.0], "score": 0.99, "text": "Flow duration curves display the relationship"}, {"category_id": 15, "poly": [116.0, 821.0, 719.0, 821.0, 719.0, 851.0, 116.0, 851.0], "score": 0.96, "text": "between streamflow and the percentage of time"}, {"category_id": 15, "poly": [116.0, 854.0, 717.0, 854.0, 717.0, 884.0, 116.0, 884.0], "score": 0.98, "text": "the streamflow is exceeded as a cumulative density"}, {"category_id": 15, "poly": [116.0, 888.0, 719.0, 888.0, 719.0, 918.0, 116.0, 918.0], "score": 1.0, "text": "function They can be constructed for any time period"}, {"category_id": 15, "poly": [116.0, 920.0, 715.0, 920.0, 715.0, 950.0, 
116.0, 950.0], "score": 0.99, "text": "(daily, weekly, monthly, etc.) and provide a graphical"}, {"category_id": 15, "poly": [114.0, 952.0, 717.0, 955.0, 717.0, 985.0, 114.0, 983.0], "score": 0.99, "text": "and statistical view of historic streamflow variability"}, {"category_id": 15, "poly": [114.0, 987.0, 717.0, 987.0, 717.0, 1017.0, 114.0, 1017.0], "score": 0.99, "text": "in a single catchment or a comparison of inter-"}, {"category_id": 15, "poly": [112.0, 1017.0, 722.0, 1017.0, 722.0, 1054.0, 112.0, 1054.0], "score": 0.99, "text": "catchment flow regimes. Vogel and Fennessey (1994)"}, {"category_id": 15, "poly": [110.0, 1047.0, 722.0, 1049.0, 722.0, 1086.0, 109.0, 1084.0], "score": 0.99, "text": "and Smakhtin (1999, 2001) demonstrate the utility"}, {"category_id": 15, "poly": [114.0, 1088.0, 719.0, 1088.0, 719.0, 1118.0, 114.0, 1118.0], "score": 1.0, "text": "(and caveats) of FDCs in characterising, comparing"}, {"category_id": 15, "poly": [114.0, 1120.0, 722.0, 1120.0, 722.0, 1150.0, 114.0, 1150.0], "score": 0.97, "text": "and predicting flow regimes at varying temporal"}, {"category_id": 15, "poly": [112.0, 1150.0, 724.0, 1150.0, 724.0, 1187.0, 112.0, 1187.0], "score": 0.98, "text": "scales. Fig. 1 is an example of annual FDCs"}, {"category_id": 15, "poly": [114.0, 1187.0, 722.0, 1187.0, 722.0, 1217.0, 114.0, 1217.0], "score": 0.99, "text": "constructed from daily flows. 
For the consideration"}, {"category_id": 15, "poly": [110.0, 1215.0, 722.0, 1217.0, 722.0, 1253.0, 109.0, 1251.0], "score": 0.99, "text": " of annual flow regime, daily fows are an appropriate"}, {"category_id": 15, "poly": [114.0, 1253.0, 477.0, 1253.0, 477.0, 1284.0, 114.0, 1284.0], "score": 0.99, "text": "time step for FDC construction."}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "255"}, {"category_id": 15, "poly": [148.0, 1284.0, 715.0, 1284.0, 715.0, 1314.0, 148.0, 1314.0], "score": 0.99, "text": "FDCs were computed from the distribution of daily"}, {"category_id": 15, "poly": [112.0, 1316.0, 720.0, 1320.0, 719.0, 1350.0, 112.0, 1346.0], "score": 1.0, "text": "flows for each year of record based on the appropriate"}, {"category_id": 15, "poly": [116.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 116.0, 1382.0], "score": 0.99, "text": "water years (May-April or November-October) for"}, {"category_id": 15, "poly": [112.0, 1380.0, 722.0, 1382.0, 722.0, 1419.0, 112.0, 1417.0], "score": 0.96, "text": "10 Southern Hemisphere catchments. Each 10th"}, {"category_id": 15, "poly": [114.0, 1419.0, 719.0, 1417.0, 720.0, 1447.0, 114.0, 1449.0], "score": 0.97, "text": "percentile (decile\uff09 was extracted from the annual"}, {"category_id": 15, "poly": [112.0, 1449.0, 720.0, 1451.0, 719.0, 1481.0, 112.0, 1479.0], "score": 0.99, "text": "FDCs of each catchment to form the data sets for"}, {"category_id": 15, "poly": [114.0, 1486.0, 719.0, 1486.0, 719.0, 1516.0, 114.0, 1516.0], "score": 0.99, "text": "analysis. 
For the purpose of characterising changes in"}, {"category_id": 15, "poly": [114.0, 1518.0, 719.0, 1518.0, 719.0, 1546.0, 114.0, 1546.0], "score": 1.0, "text": "each of the deciles, it is assumed that the time series is"}, {"category_id": 15, "poly": [114.0, 1550.0, 719.0, 1550.0, 719.0, 1580.0, 114.0, 1580.0], "score": 0.96, "text": "principally a function of climate and vegetation"}, {"category_id": 15, "poly": [114.0, 1585.0, 722.0, 1585.0, 722.0, 1615.0, 114.0, 1615.0], "score": 0.99, "text": "characteristics. Given rainfall is generally the most"}, {"category_id": 15, "poly": [114.0, 1619.0, 722.0, 1619.0, 722.0, 1649.0, 114.0, 1649.0], "score": 0.97, "text": "important factor affecting streamflow and the most"}, {"category_id": 15, "poly": [116.0, 1651.0, 719.0, 1651.0, 719.0, 1681.0, 116.0, 1681.0], "score": 0.98, "text": "easily accessed data, it is chosen to represent the"}, {"category_id": 15, "poly": [116.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 116.0, 1716.0], "score": 1.0, "text": "climate. Catchment physical properties such as soil"}, {"category_id": 15, "poly": [114.0, 1716.0, 722.0, 1716.0, 722.0, 1752.0, 114.0, 1752.0], "score": 0.98, "text": "properties and topography are assumed to be time"}, {"category_id": 15, "poly": [116.0, 1752.0, 719.0, 1752.0, 719.0, 1782.0, 116.0, 1782.0], "score": 0.99, "text": "invariant and therefore their impact on runoff is"}, {"category_id": 15, "poly": [118.0, 1784.0, 719.0, 1784.0, 719.0, 1815.0, 118.0, 1815.0], "score": 1.0, "text": "considered constant throughout the analysis. 
As trees"}, {"category_id": 15, "poly": [116.0, 1819.0, 715.0, 1819.0, 715.0, 1849.0, 116.0, 1849.0], "score": 1.0, "text": "intercept and transpire at increasing rates until canopy"}], "page_info": {"page_no": 2, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 4, "poly": [129.72743225097656, 1201.7288818359375, 731.863037109375, 1201.7288818359375, 731.863037109375, 1256.5126953125, 129.72743225097656, 1256.5126953125], "score": 0.9999990463256836}, {"category_id": 1, "poly": [130.2001953125, 1783.3021240234375, 730.197509765625, 1783.3021240234375, 730.197509765625, 1846.7928466796875, 130.2001953125, 1846.7928466796875], "score": 0.9999982714653015}, {"category_id": 0, "poly": [797.18896484375, 501.0386047363281, 1060.7071533203125, 501.0386047363281, 1060.7071533203125, 529.1184692382812, 797.18896484375, 529.1184692382812], "score": 0.9999982714653015}, {"category_id": 2, "poly": [130.7757568359375, 195.0663299560547, 166.40858459472656, 195.0663299560547, 166.40858459472656, 215.67367553710938, 130.7757568359375, 215.67367553710938], "score": 0.9999974966049194}, {"category_id": 9, "poly": [1360.5223388671875, 807.8145751953125, 1393.251953125, 807.8145751953125, 1393.251953125, 835.9564819335938, 1360.5223388671875, 835.9564819335938], "score": 0.9999971389770508}, {"category_id": 3, "poly": [140.5875244140625, 256.1985778808594, 711.4976806640625, 256.1985778808594, 711.4976806640625, 1180.2288818359375, 140.5875244140625, 1180.2288818359375], "score": 0.999996542930603}, {"category_id": 1, "poly": [795.8214721679688, 1244.741455078125, 1393.836181640625, 1244.741455078125, 1393.836181640625, 1508.3616943359375, 795.8214721679688, 1508.3616943359375], "score": 0.9999949932098389}, {"category_id": 2, "poly": [480.60809326171875, 195.57171630859375, 1043.654296875, 195.57171630859375, 1043.654296875, 218.8836212158203, 480.60809326171875, 218.8836212158203], "score": 0.9999940395355225}, {"category_id": 1, "poly": [794.92333984375, 
878.8724365234375, 1394.78515625, 878.8724365234375, 1394.78515625, 1241.811279296875, 794.92333984375, 1241.811279296875], "score": 0.9999939799308777}, {"category_id": 8, "poly": [792.0145263671875, 779.4395751953125, 1107.5592041015625, 779.4395751953125, 1107.5592041015625, 865.4520263671875, 792.0145263671875, 865.4520263671875], "score": 0.9999935030937195}, {"category_id": 1, "poly": [794.2274780273438, 567.8933715820312, 1393.5377197265625, 567.8933715820312, 1393.5377197265625, 762.2144775390625, 794.2274780273438, 762.2144775390625], "score": 0.9999918341636658}, {"category_id": 1, "poly": [795.5938110351562, 1715.463134765625, 1394.151611328125, 1715.463134765625, 1394.151611328125, 1845.3857421875, 795.5938110351562, 1845.3857421875], "score": 0.999987781047821}, {"category_id": 1, "poly": [794.4356689453125, 255.30477905273438, 1393.678466796875, 255.30477905273438, 1393.678466796875, 447.8646240234375, 794.4356689453125, 447.8646240234375], "score": 0.9999871253967285}, {"category_id": 1, "poly": [130.53660583496094, 1355.89013671875, 730.9114379882812, 1355.89013671875, 730.9114379882812, 1652.1812744140625, 130.53660583496094, 1652.1812744140625], "score": 0.999987006187439}, {"category_id": 9, "poly": [696.6166381835938, 1699.391845703125, 728.77880859375, 1699.391845703125, 728.77880859375, 1727.2147216796875, 696.6166381835938, 1727.2147216796875], "score": 0.999981164932251}, {"category_id": 9, "poly": [1360.9091796875, 1667.6871337890625, 1393.8095703125, 1667.6871337890625, 1393.8095703125, 1699.094482421875, 1360.9091796875, 1699.094482421875], "score": 0.9999788999557495}, {"category_id": 8, "poly": [790.2078857421875, 1522.67236328125, 1111.4049072265625, 1522.67236328125, 1111.4049072265625, 1604.606689453125, 790.2078857421875, 1604.606689453125], "score": 0.9999706149101257}, {"category_id": 9, "poly": [1361.0799560546875, 1545.7677001953125, 1393.7020263671875, 1545.7677001953125, 1393.7020263671875, 1573.452392578125, 
1361.0799560546875, 1573.452392578125], "score": 0.9998459815979004}, {"category_id": 8, "poly": [127.09381866455078, 1678.0965576171875, 565.4200439453125, 1678.0965576171875, 565.4200439453125, 1756.1007080078125, 127.09381866455078, 1756.1007080078125], "score": 0.9997967481613159}, {"category_id": 8, "poly": [794.1704711914062, 1666.248779296875, 974.3306274414062, 1666.248779296875, 974.3306274414062, 1700.88720703125, 794.1704711914062, 1700.88720703125], "score": 0.9997556209564209}, {"category_id": 0, "poly": [131.9687042236328, 1288.984375, 435.8473205566406, 1288.984375, 435.8473205566406, 1316.791259765625, 131.9687042236328, 1316.791259765625], "score": 0.9995421767234802}, {"category_id": 1, "poly": [794.0263671875, 1622.5870361328125, 839.6729125976562, 1622.5870361328125, 839.6729125976562, 1647.691650390625, 794.0263671875, 1647.691650390625], "score": 0.9984337687492371}, {"category_id": 14, "poly": [790, 777, 1108, 777, 1108, 863, 790, 863], "score": 0.94, "latex": "E=1.0-\\frac{\\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\\sum_{i=1}^{N}(O_{i}-\\bar{O})^{2}}"}, {"category_id": 14, "poly": [790, 1521, 1110, 1521, 1110, 1602, 790, 1602], "score": 0.94, "latex": "Q_{\\mathcal{Q}}=a+\\frac{Y}{1+\\exp\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 14, "poly": [125, 1674, 566, 1674, 566, 1756, 125, 1756], "score": 0.93, "latex": "N_{\\mathrm{zero}}=a+b(\\Delta P)+\\frac{Y}{1+\\exp\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 13, "poly": [1306, 319, 1388, 319, 1388, 349, 1306, 349], "score": 0.91, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [529, 1555, 589, 1555, 589, 1585, 529, 1585], "score": 0.9, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [1281, 1176, 1365, 1176, 1365, 1205, 1281, 1205], "score": 0.9, "latex": "E\\!>\\!0.7"}, {"category_id": 13, "poly": [880, 1173, 931, 1173, 931, 1206, 880, 1206], "score": 0.89, "latex": "<\\!r^{2}"}, {"category_id": 13, "poly": [873, 1409, 932, 1409, 932, 
1438, 873, 1438], "score": 0.89, "latex": "b\\!=\\!0"}, {"category_id": 13, "poly": [597, 1522, 656, 1522, 656, 1552, 597, 1552], "score": 0.89, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [792, 353, 856, 353, 856, 382, 792, 382], "score": 0.88, "latex": "a+Y"}, {"category_id": 13, "poly": [649, 1782, 731, 1782, 731, 1810, 649, 1810], "score": 0.88, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 14, "poly": [791, 1663, 976, 1663, 976, 1699, 791, 1699], "score": 0.88, "latex": "Q_{\\%}=a+b\\Delta P"}, {"category_id": 13, "poly": [1199, 1409, 1259, 1409, 1259, 1438, 1199, 1438], "score": 0.87, "latex": "Y{=}\\,0"}, {"category_id": 13, "poly": [513, 1487, 585, 1487, 585, 1519, 513, 1519], "score": 0.85, "latex": "(N_{\\mathrm{zero}})"}, {"category_id": 13, "poly": [1335, 1073, 1362, 1073, 1362, 1104, 1335, 1104], "score": 0.84, "latex": "r^{2}"}, {"category_id": 13, "poly": [845, 908, 869, 908, 869, 938, 845, 938], "score": 0.81, "latex": "\\bar{O}"}, {"category_id": 13, "poly": [1123, 880, 1146, 880, 1146, 905, 1123, 905], "score": 0.79, "latex": "P"}, {"category_id": 13, "poly": [1344, 1145, 1367, 1145, 1367, 1171, 1344, 1171], "score": 0.79, "latex": "E"}, {"category_id": 13, "poly": [872, 879, 896, 879, 896, 905, 872, 905], "score": 0.77, "latex": "o"}, {"category_id": 13, "poly": [713, 1521, 731, 1521, 731, 1548, 713, 1548], "score": 0.76, "latex": "b"}, {"category_id": 13, "poly": [1274, 912, 1298, 912, 1298, 938, 1274, 938], "score": 0.76, "latex": "E"}, {"category_id": 13, "poly": [1347, 699, 1369, 699, 1369, 726, 1347, 726], "score": 0.75, "latex": "E"}, {"category_id": 13, "poly": [263, 1815, 326, 1815, 326, 1847, 263, 1847], "score": 0.74, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [185, 1814, 245, 1814, 245, 1845, 185, 1845], "score": 0.73, "latex": "T\\!\\!=\\!\\!0"}, {"category_id": 13, "poly": [1010, 1819, 1023, 1819, 1023, 1842, 1010, 1842], "score": 0.7, "latex": "t"}, {"category_id": 13, "poly": [1207, 565, 1246, 565, 
1246, 596, 1207, 596], "score": 0.67, "latex": "(E)"}, {"category_id": 13, "poly": [1310, 979, 1364, 979, 1364, 1007, 1310, 1007], "score": 0.64, "latex": "-\\infty"}, {"category_id": 13, "poly": [1031, 1754, 1044, 1754, 1044, 1776, 1031, 1776], "score": 0.57, "latex": "t\\cdot"}, {"category_id": 13, "poly": [1313, 1818, 1326, 1818, 1326, 1842, 1313, 1842], "score": 0.57, "latex": "t\\cdot"}, {"category_id": 13, "poly": [960, 1073, 1001, 1073, 1001, 1108, 960, 1108], "score": 0.55, "latex": "(r^{2})"}, {"category_id": 13, "poly": [175, 1555, 194, 1555, 194, 1582, 175, 1582], "score": 0.47, "latex": "S"}, {"category_id": 13, "poly": [1020, 287, 1043, 287, 1043, 315, 1020, 315], "score": 0.38, "latex": "S"}, {"category_id": 13, "poly": [1016, 1076, 1040, 1076, 1040, 1105, 1016, 1105], "score": 0.36, "latex": "E"}, {"category_id": 13, "poly": [599, 1815, 660, 1815, 660, 1845, 599, 1845], "score": 0.35, "latex": "a,~Y"}, {"category_id": 13, "poly": [637, 1816, 660, 1816, 660, 1843, 637, 1843], "score": 0.32, "latex": "Y"}, {"category_id": 13, "poly": [184, 1814, 324, 1814, 324, 1847, 184, 1847], "score": 0.27, "latex": "T\\!\\!=\\!0,\\ N_{\\mathrm{zero}}"}, {"category_id": 15, "poly": [131.0, 1204.0, 732.0, 1204.0, 732.0, 1232.0, 131.0, 1232.0], "score": 1.0, "text": "Fig. 2. 
(a) Schematic of the change in the FDC over time, and"}, {"category_id": 15, "poly": [129.0, 1227.0, 447.0, 1232.0, 446.0, 1260.0, 129.0, 1255.0], "score": 0.98, "text": "(b) definition of model parameters."}, {"category_id": 15, "poly": [159.0, 1778.0, 648.0, 1778.0, 648.0, 1821.0, 159.0, 1821.0], "score": 0.99, "text": "For the average pre-treatment condition "}, {"category_id": 15, "poly": [327.0, 1819.0, 598.0, 1819.0, 598.0, 1849.0, 327.0, 1849.0], "score": 0.98, "text": " approximately equals"}, {"category_id": 15, "poly": [661.0, 1819.0, 728.0, 1819.0, 728.0, 1849.0, 661.0, 1849.0], "score": 1.0, "text": "gives"}, {"category_id": 15, "poly": [129.0, 1819.0, 183.0, 1819.0, 183.0, 1849.0, 129.0, 1849.0], "score": 0.88, "text": "and "}, {"category_id": 15, "poly": [793.0, 499.0, 1065.0, 499.0, 1065.0, 535.0, 793.0, 535.0], "score": 0.98, "text": "2.3. Statistical analyses"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "256"}, {"category_id": 15, "poly": [825.0, 1245.0, 1396.0, 1245.0, 1396.0, 1275.0, 825.0, 1275.0], "score": 0.98, "text": "It is important to assess the significance of the"}, {"category_id": 15, "poly": [790.0, 1279.0, 1396.0, 1279.0, 1396.0, 1309.0, 790.0, 1309.0], "score": 0.97, "text": "model parameters to check the model assumptions"}, {"category_id": 15, "poly": [788.0, 1307.0, 1400.0, 1309.0, 1400.0, 1346.0, 788.0, 1344.0], "score": 1.0, "text": "that rainfall and forest age are driving changes in the"}, {"category_id": 15, "poly": [790.0, 1346.0, 1396.0, 1346.0, 1396.0, 1376.0, 790.0, 1376.0], "score": 0.99, "text": "FDC. The model (2) was split into simplified forms,"}, {"category_id": 15, "poly": [793.0, 1378.0, 1396.0, 1378.0, 1396.0, 1408.0, 793.0, 1408.0], "score": 1.0, "text": "where only the rainfall or time terms were included by"}, {"category_id": 15, "poly": [793.0, 1445.0, 1398.0, 1445.0, 1398.0, 1475.0, 793.0, 1475.0], "score": 0.99, "text": "Eq. (6). 
The component models (5) and (6) were then"}, {"category_id": 15, "poly": [790.0, 1477.0, 1233.0, 1477.0, 1233.0, 1507.0, 790.0, 1507.0], "score": 1.0, "text": "tested against the complete model, (2)."}, {"category_id": 15, "poly": [790.0, 1408.0, 872.0, 1408.0, 872.0, 1445.0, 790.0, 1445.0], "score": 0.99, "text": "setting"}, {"category_id": 15, "poly": [933.0, 1408.0, 1198.0, 1408.0, 1198.0, 1445.0, 933.0, 1445.0], "score": 0.99, "text": ", as shown in Eq. (5), or"}, {"category_id": 15, "poly": [1260.0, 1408.0, 1400.0, 1408.0, 1400.0, 1445.0, 1260.0, 1445.0], "score": 0.97, "text": "as shown in"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [790.0, 944.0, 1400.0, 944.0, 1400.0, 980.0, 790.0, 980.0], "score": 1.0, "text": "minus the ratio of the mean square error to the"}, {"category_id": 15, "poly": [790.0, 1008.0, 1400.0, 1011.0, 1400.0, 1047.0, 790.0, 1045.0], "score": 0.98, "text": "1.0. 
Higher values indicate greater agreement between"}, {"category_id": 15, "poly": [788.0, 1041.0, 1403.0, 1043.0, 1402.0, 1079.0, 788.0, 1077.0], "score": 0.97, "text": " observed and predicted data as per the coefficient of "}, {"category_id": 15, "poly": [793.0, 1112.0, 1400.0, 1112.0, 1400.0, 1142.0, 793.0, 1142.0], "score": 0.97, "text": "evaluating hydrologic modelling because it is a"}, {"category_id": 15, "poly": [790.0, 1213.0, 1121.0, 1213.0, 1121.0, 1241.0, 790.0, 1241.0], "score": 0.99, "text": "indicate adequate model fits."}, {"category_id": 15, "poly": [1366.0, 1178.0, 1398.0, 1178.0, 1398.0, 1208.0, 1366.0, 1208.0], "score": 1.0, "text": "to"}, {"category_id": 15, "poly": [793.0, 1178.0, 879.0, 1178.0, 879.0, 1208.0, 793.0, 1208.0], "score": 1.0, "text": "always"}, {"category_id": 15, "poly": [932.0, 1178.0, 1280.0, 1178.0, 1280.0, 1208.0, 932.0, 1208.0], "score": 1.0, "text": "we have arbitrarily considered"}, {"category_id": 15, "poly": [1363.0, 1075.0, 1402.0, 1075.0, 1402.0, 1112.0, 1363.0, 1112.0], "score": 1.0, "text": "in"}, {"category_id": 15, "poly": [788.0, 909.0, 844.0, 909.0, 844.0, 946.0, 788.0, 946.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1147.0, 875.0, 1398.0, 877.0, 1398.0, 914.0, 1147.0, 912.0], "score": 0.97, "text": " are predicted values,"}, {"category_id": 15, "poly": [793.0, 1146.0, 1343.0, 1146.0, 1343.0, 1176.0, 793.0, 1176.0], "score": 1.0, "text": "measure of the deviation from the 1:1 line. 
As"}, {"category_id": 15, "poly": [1368.0, 1146.0, 1398.0, 1146.0, 1398.0, 1176.0, 1368.0, 1176.0], "score": 1.0, "text": "is"}, {"category_id": 15, "poly": [788.0, 875.0, 871.0, 877.0, 871.0, 914.0, 788.0, 912.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [897.0, 875.0, 1122.0, 877.0, 1122.0, 914.0, 897.0, 912.0], "score": 1.0, "text": "are observed data,"}, {"category_id": 15, "poly": [870.0, 909.0, 1273.0, 909.0, 1273.0, 946.0, 870.0, 946.0], "score": 0.97, "text": " is the mean for the entire period."}, {"category_id": 15, "poly": [1299.0, 909.0, 1398.0, 909.0, 1398.0, 946.0, 1299.0, 946.0], "score": 1.0, "text": "is unity"}, {"category_id": 15, "poly": [793.0, 980.0, 1309.0, 980.0, 1309.0, 1010.0, 793.0, 1010.0], "score": 1.0, "text": "variance in the observed data, and ranges from"}, {"category_id": 15, "poly": [1365.0, 980.0, 1398.0, 980.0, 1398.0, 1010.0, 1365.0, 1010.0], "score": 1.0, "text": "to"}, {"category_id": 15, "poly": [790.0, 1075.0, 959.0, 1075.0, 959.0, 1112.0, 790.0, 1112.0], "score": 1.0, "text": "determination"}, {"category_id": 15, "poly": [1041.0, 1075.0, 1334.0, 1075.0, 1334.0, 1112.0, 1041.0, 1112.0], "score": 0.98, "text": "is used in preference to"}, {"category_id": 15, "poly": [790.0, 596.0, 1398.0, 598.0, 1398.0, 634.0, 790.0, 632.0], "score": 1.0, "text": "Sutcliffe, 1970; Chiew and McMahon, 1993; Legates"}, {"category_id": 15, "poly": [788.0, 628.0, 1396.0, 632.0, 1396.0, 667.0, 788.0, 662.0], "score": 0.96, "text": " and McCabe, 1999) was used as the ^goodness of fit\u2019"}, {"category_id": 15, "poly": [793.0, 669.0, 1398.0, 669.0, 1398.0, 697.0, 793.0, 697.0], "score": 0.99, "text": "measure to evaluate the fit between observed and"}, {"category_id": 15, "poly": [790.0, 736.0, 905.0, 731.0, 907.0, 763.0, 791.0, 768.0], "score": 0.98, "text": "given by:"}, {"category_id": 15, "poly": [790.0, 701.0, 1346.0, 699.0, 1346.0, 729.0, 790.0, 731.0], "score": 0.99, "text": "predicted flow deciles (2) and zero flow days 
(3)."}, {"category_id": 15, "poly": [1370.0, 701.0, 1398.0, 699.0, 1398.0, 729.0, 1370.0, 731.0], "score": 1.0, "text": "is"}, {"category_id": 15, "poly": [827.0, 568.0, 1206.0, 568.0, 1206.0, 598.0, 827.0, 598.0], "score": 0.95, "text": "The coefficient of efficiency"}, {"category_id": 15, "poly": [1247.0, 568.0, 1398.0, 568.0, 1398.0, 598.0, 1247.0, 598.0], "score": 0.97, "text": "(Nash and"}, {"category_id": 15, "poly": [825.0, 1716.0, 1394.0, 1716.0, 1394.0, 1752.0, 825.0, 1752.0], "score": 0.99, "text": "For both the fow duration curve analysis and zero"}, {"category_id": 15, "poly": [795.0, 1784.0, 1392.0, 1784.0, 1392.0, 1815.0, 795.0, 1815.0], "score": 0.98, "text": "whether (5) and (6) were significantly different to (2)."}, {"category_id": 15, "poly": [790.0, 1812.0, 1009.0, 1815.0, 1009.0, 1851.0, 790.0, 1849.0], "score": 0.99, "text": "A critical value of"}, {"category_id": 15, "poly": [790.0, 1750.0, 1030.0, 1752.0, 1030.0, 1782.0, 790.0, 1780.0], "score": 1.0, "text": "flow days analysis, a"}, {"category_id": 15, "poly": [1045.0, 1750.0, 1394.0, 1752.0, 1394.0, 1782.0, 1045.0, 1780.0], "score": 0.98, "text": "-test was then performed to test"}, {"category_id": 15, "poly": [1024.0, 1812.0, 1312.0, 1815.0, 1312.0, 1851.0, 1024.0, 1849.0], "score": 1.0, "text": "exceeding the calculated"}, {"category_id": 15, "poly": [1327.0, 1812.0, 1396.0, 1815.0, 1396.0, 1851.0, 1327.0, 1849.0], "score": 1.0, "text": "value"}, {"category_id": 15, "poly": [795.0, 256.0, 1398.0, 256.0, 1398.0, 286.0, 795.0, 286.0], "score": 0.97, "text": "the magnitude of change in zero flow days due to"}, {"category_id": 15, "poly": [790.0, 389.0, 1398.0, 389.0, 1398.0, 419.0, 790.0, 419.0], "score": 0.96, "text": "new equilibrium condition under afforestation is"}, {"category_id": 15, "poly": [790.0, 421.0, 891.0, 421.0, 891.0, 452.0, 790.0, 452.0], "score": 1.0, "text": "reached."}, {"category_id": 15, "poly": [793.0, 322.0, 1305.0, 322.0, 1305.0, 353.0, 793.0, 353.0], "score": 1.0, 
"text": "response. For the average climate condition"}, {"category_id": 15, "poly": [857.0, 355.0, 1398.0, 355.0, 1398.0, 385.0, 857.0, 385.0], "score": 0.99, "text": "becomes the number of zero flow days when the"}, {"category_id": 15, "poly": [793.0, 290.0, 1019.0, 290.0, 1019.0, 320.0, 793.0, 320.0], "score": 0.98, "text": "afforestation, and"}, {"category_id": 15, "poly": [1044.0, 290.0, 1398.0, 290.0, 1398.0, 320.0, 1044.0, 320.0], "score": 0.95, "text": " describes the shape of the"}, {"category_id": 15, "poly": [157.0, 1350.0, 732.0, 1352.0, 732.0, 1389.0, 157.0, 1387.0], "score": 0.98, "text": " A notable feature of Fig. 1 is the increase in the"}, {"category_id": 15, "poly": [127.0, 1389.0, 735.0, 1389.0, 735.0, 1425.0, 127.0, 1425.0], "score": 0.99, "text": "number of zero fow days. A similar approach to"}, {"category_id": 15, "poly": [129.0, 1423.0, 735.0, 1423.0, 735.0, 1453.0, 129.0, 1453.0], "score": 0.98, "text": "Eq. (2), using an inverse sigmoidal function was"}, {"category_id": 15, "poly": [129.0, 1456.0, 732.0, 1456.0, 732.0, 1486.0, 129.0, 1486.0], "score": 0.98, "text": "employed to assess the impact of afforestation on the"}, {"category_id": 15, "poly": [129.0, 1589.0, 735.0, 1589.0, 735.0, 1619.0, 129.0, 1619.0], "score": 0.99, "text": "rainfall increases, and increases with plantation"}, {"category_id": 15, "poly": [126.0, 1624.0, 220.0, 1618.0, 222.0, 1651.0, 128.0, 1656.0], "score": 1.0, "text": "growth:"}, {"category_id": 15, "poly": [590.0, 1557.0, 732.0, 1557.0, 732.0, 1587.0, 590.0, 1587.0], "score": 1.0, "text": "decreases as"}, {"category_id": 15, "poly": [129.0, 1524.0, 596.0, 1524.0, 596.0, 1554.0, 129.0, 1554.0], "score": 0.98, "text": "the left hand side of Eq. 
(2) is replaced by"}, {"category_id": 15, "poly": [129.0, 1490.0, 512.0, 1490.0, 512.0, 1520.0, 129.0, 1520.0], "score": 0.99, "text": "number of zero flow days per year"}, {"category_id": 15, "poly": [586.0, 1490.0, 732.0, 1490.0, 732.0, 1520.0, 586.0, 1520.0], "score": 0.97, "text": ". In this case,"}, {"category_id": 15, "poly": [657.0, 1524.0, 712.0, 1524.0, 712.0, 1554.0, 657.0, 1554.0], "score": 0.97, "text": ", and"}, {"category_id": 15, "poly": [129.0, 1557.0, 174.0, 1557.0, 174.0, 1587.0, 129.0, 1587.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [195.0, 1557.0, 528.0, 1557.0, 528.0, 1587.0, 195.0, 1587.0], "score": 0.99, "text": "are constrained to negative as"}, {"category_id": 15, "poly": [129.0, 1288.0, 438.0, 1288.0, 438.0, 1324.0, 129.0, 1324.0], "score": 0.99, "text": "2.2. Zero fow day analysis"}, {"category_id": 15, "poly": [788.0, 1617.0, 844.0, 1617.0, 844.0, 1662.0, 788.0, 1662.0], "score": 1.0, "text": "and"}], "page_info": {"page_no": 3, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 1, "poly": [780.4981079101562, 951.0537109375, 1382.5201416015625, 951.0537109375, 1382.5201416015625, 1648.58154296875, 780.4981079101562, 1648.58154296875], "score": 0.9999959468841553}, {"category_id": 2, "poly": [466.9576110839844, 194.6658935546875, 1030.6968994140625, 194.6658935546875, 1030.6968994140625, 219.20504760742188, 466.9576110839844, 219.20504760742188], "score": 0.9999955892562866}, {"category_id": 0, "poly": [782.29931640625, 886.77197265625, 919.8917236328125, 886.77197265625, 919.8917236328125, 915.8782348632812, 782.29931640625, 915.8782348632812], "score": 0.9999911189079285}, {"category_id": 1, "poly": [782.0343017578125, 253.53207397460938, 1382.2440185546875, 253.53207397460938, 1382.2440185546875, 350.4256896972656, 782.0343017578125, 350.4256896972656], "score": 0.9999889731407166}, {"category_id": 1, "poly": [781.621826171875, 653.5359497070312, 1381.272705078125, 653.5359497070312, 1381.272705078125, 
783.18798828125, 781.621826171875, 783.18798828125], "score": 0.9999889731407166}, {"category_id": 5, "poly": [182.05813598632812, 248.86912536621094, 650.3305053710938, 248.86912536621094, 650.3305053710938, 1845.613037109375, 182.05813598632812, 1845.613037109375], "score": 0.9999887347221375}, {"category_id": 1, "poly": [781.0881958007812, 1650.3038330078125, 1382.088134765625, 1650.3038330078125, 1382.088134765625, 1848.214111328125, 781.0881958007812, 1848.214111328125], "score": 0.9999865293502808}, {"category_id": 2, "poly": [1346.05322265625, 194.5203399658203, 1381.46875, 194.5203399658203, 1381.46875, 216.90557861328125, 1346.05322265625, 216.90557861328125], "score": 0.9999804496765137}, {"category_id": 8, "poly": [779.12451171875, 544.6279296875, 1165.58349609375, 544.6279296875, 1165.58349609375, 623.5341796875, 779.12451171875, 623.5341796875], "score": 0.9999717473983765}, {"category_id": 1, "poly": [781.6971435546875, 352.1080017089844, 1382.5953369140625, 352.1080017089844, 1382.5953369140625, 515.912109375, 781.6971435546875, 515.912109375], "score": 0.999969482421875}, {"category_id": 9, "poly": [1347.20849609375, 571.1251831054688, 1380.7503662109375, 571.1251831054688, 1380.7503662109375, 601.0969848632812, 1347.20849609375, 601.0969848632812], "score": 0.9999024868011475}, {"category_id": 7, "poly": [659.8250732421875, 882.5633544921875, 686.8219604492188, 882.5633544921875, 686.8219604492188, 1842.583251953125, 659.8250732421875, 1842.583251953125], "score": 0.9764553904533386}, {"category_id": 6, "poly": [112.29073333740234, 1497.288330078125, 169.8206329345703, 1497.288330078125, 169.8206329345703, 1843.9019775390625, 112.29073333740234, 1843.9019775390625], "score": 0.8885180950164795}, {"category_id": 14, "poly": [776, 546, 1164, 546, 1164, 622, 776, 622], "score": 0.91, "latex": "F=\\frac{[(\\mathrm{SSE_{s}-S S E_{c}})/(\\mathrm{df_{c}-d f_{s}})]}{\\mathrm{SSE_{c}/d f_{c}}}"}, {"category_id": 13, "poly": [1087, 415, 1135, 415, 1135, 447, 
1087, 447], "score": 0.88, "latex": "F^{0.5}"}, {"category_id": 13, "poly": [1155, 1183, 1223, 1183, 1223, 1214, 1155, 1214], "score": 0.86, "latex": "100\\%"}, {"category_id": 13, "poly": [779, 1781, 820, 1781, 820, 1812, 779, 1812], "score": 0.82, "latex": "6\\%"}, {"category_id": 13, "poly": [831, 487, 852, 487, 852, 513, 831, 513], "score": 0.77, "latex": "F"}, {"category_id": 13, "poly": [1120, 390, 1133, 390, 1133, 413, 1120, 413], "score": 0.72, "latex": "t\\cdot"}, {"category_id": 13, "poly": [780, 423, 792, 423, 792, 446, 780, 446], "score": 0.49, "latex": "t\\cdot"}, {"category_id": 13, "poly": [1074, 1716, 1095, 1716, 1095, 1742, 1074, 1742], "score": 0.31, "latex": "P"}, {"category_id": 15, "poly": [814.0, 952.0, 1383.0, 952.0, 1383.0, 983.0, 814.0, 983.0], "score": 0.98, "text": "Daily streamflow data were obtained from 10"}, {"category_id": 15, "poly": [782.0, 987.0, 1383.0, 987.0, 1383.0, 1017.0, 782.0, 1017.0], "score": 0.99, "text": "catchment studies from southeastern Australia, New"}, {"category_id": 15, "poly": [780.0, 1019.0, 1383.0, 1019.0, 1383.0, 1049.0, 780.0, 1049.0], "score": 0.99, "text": "Zealand and South Africa. The initial criteria for"}, {"category_id": 15, "poly": [775.0, 1047.0, 1383.0, 1051.0, 1383.0, 1088.0, 775.0, 1084.0], "score": 0.99, "text": "selection of these catchments were a known veg-"}, {"category_id": 15, "poly": [780.0, 1088.0, 1385.0, 1088.0, 1385.0, 1118.0, 780.0, 1118.0], "score": 0.95, "text": "etation history and streamflow records of good"}, {"category_id": 15, "poly": [780.0, 1120.0, 1381.0, 1120.0, 1381.0, 1150.0, 780.0, 1150.0], "score": 0.99, "text": "quality. 
The ideal data sets were those with a lengthy"}, {"category_id": 15, "poly": [777.0, 1155.0, 1385.0, 1152.0, 1385.0, 1182.0, 778.0, 1185.0], "score": 0.97, "text": "pre- and post-treatment (plantation establishment)"}, {"category_id": 15, "poly": [775.0, 1215.0, 1387.0, 1217.0, 1387.0, 1253.0, 775.0, 1251.0], "score": 0.99, "text": " ment converted from grassland or a crop equivalent to"}, {"category_id": 15, "poly": [780.0, 1253.0, 1385.0, 1253.0, 1385.0, 1284.0, 780.0, 1284.0], "score": 0.99, "text": "plantation. In reality, all these criteria were not easy to"}, {"category_id": 15, "poly": [782.0, 1286.0, 1383.0, 1286.0, 1383.0, 1316.0, 782.0, 1316.0], "score": 0.99, "text": "satisfy. For example in Victoria, Australia, the best"}, {"category_id": 15, "poly": [780.0, 1320.0, 1385.0, 1320.0, 1385.0, 1348.0, 780.0, 1348.0], "score": 0.99, "text": "data is from Stewarts Creek, a set of decommissioned"}, {"category_id": 15, "poly": [780.0, 1352.0, 1385.0, 1352.0, 1385.0, 1382.0, 780.0, 1382.0], "score": 0.99, "text": "research catchments with 9 years of pre-treatment"}, {"category_id": 15, "poly": [780.0, 1387.0, 1383.0, 1387.0, 1383.0, 1417.0, 780.0, 1417.0], "score": 1.0, "text": "data and 25 years of post-treatment. Here, though, the"}, {"category_id": 15, "poly": [778.0, 1417.0, 1383.0, 1419.0, 1383.0, 1449.0, 777.0, 1447.0], "score": 0.98, "text": "treatment was a conversion from native eucalypt"}, {"category_id": 15, "poly": [780.0, 1453.0, 1385.0, 1453.0, 1385.0, 1484.0, 780.0, 1484.0], "score": 0.99, "text": "forest to pine. The assumption made for this data set is"}, {"category_id": 15, "poly": [780.0, 1486.0, 1383.0, 1486.0, 1383.0, 1516.0, 780.0, 1516.0], "score": 0.98, "text": "that the immediate post-treatment period may be"}, {"category_id": 15, "poly": [780.0, 1518.0, 1383.0, 1518.0, 1383.0, 1548.0, 780.0, 1548.0], "score": 0.99, "text": "viewed as a non-forested condition. 
This condition is"}, {"category_id": 15, "poly": [777.0, 1552.0, 1385.0, 1552.0, 1385.0, 1582.0, 777.0, 1582.0], "score": 0.99, "text": "likely to approximate the ET conditions of pasture or"}, {"category_id": 15, "poly": [780.0, 1587.0, 1385.0, 1587.0, 1385.0, 1617.0, 780.0, 1617.0], "score": 0.98, "text": "short crops for up to 3 years. Catchment details and"}, {"category_id": 15, "poly": [775.0, 1619.0, 1145.0, 1617.0, 1145.0, 1647.0, 775.0, 1649.0], "score": 1.0, "text": "treatments are given in Table 1."}, {"category_id": 15, "poly": [780.0, 1187.0, 1154.0, 1187.0, 1154.0, 1217.0, 780.0, 1217.0], "score": 0.98, "text": "flow record with approximately"}, {"category_id": 15, "poly": [1224.0, 1187.0, 1381.0, 1187.0, 1381.0, 1217.0, 1224.0, 1217.0], "score": 0.99, "text": "of the catch-"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [776.0, 881.0, 924.0, 886.0, 923.0, 925.0, 775.0, 920.0], "score": 0.96, "text": " 3. 
Data sets"}, {"category_id": 15, "poly": [782.0, 254.0, 1383.0, 254.0, 1383.0, 290.0, 782.0, 290.0], "score": 0.99, "text": "when comparing (5) and (2) would indicate the time"}, {"category_id": 15, "poly": [782.0, 290.0, 1381.0, 290.0, 1381.0, 320.0, 782.0, 320.0], "score": 1.0, "text": "term in (6) was required to improve the complete"}, {"category_id": 15, "poly": [778.0, 318.0, 1347.0, 320.0, 1347.0, 357.0, 777.0, 355.0], "score": 1.0, "text": "model and is therefore significant, and vice versa."}, {"category_id": 15, "poly": [782.0, 656.0, 1385.0, 656.0, 1385.0, 686.0, 782.0, 686.0], "score": 1.0, "text": "where SSE is the residual sum of the squared errors, df"}, {"category_id": 15, "poly": [780.0, 686.0, 1383.0, 686.0, 1383.0, 722.0, 780.0, 722.0], "score": 0.99, "text": "is degrees of freedom, and the subscripts s and c refer"}, {"category_id": 15, "poly": [778.0, 720.0, 1379.0, 722.0, 1379.0, 753.0, 777.0, 750.0], "score": 0.98, "text": "to the simplified model and complete models,"}, {"category_id": 15, "poly": [777.0, 752.0, 926.0, 752.0, 926.0, 789.0, 777.0, 789.0], "score": 0.97, "text": "respectively."}, {"category_id": 15, "poly": [816.0, 1651.0, 1381.0, 1651.0, 1381.0, 1681.0, 816.0, 1681.0], "score": 0.99, "text": "All catchments, with the exception of Traralgon"}, {"category_id": 15, "poly": [782.0, 1686.0, 1379.0, 1686.0, 1379.0, 1716.0, 782.0, 1716.0], "score": 1.0, "text": "Creek, were afforested with pine species, predomi-"}, {"category_id": 15, "poly": [782.0, 1752.0, 1379.0, 1752.0, 1379.0, 1782.0, 782.0, 1782.0], "score": 1.0, "text": "Cathedral Peak catchments. 
Traralgon Creek has only"}, {"category_id": 15, "poly": [776.0, 1812.0, 1147.0, 1817.0, 1147.0, 1851.0, 775.0, 1847.0], "score": 0.99, "text": "of which is Eucalyptus regnans."}, {"category_id": 15, "poly": [821.0, 1784.0, 1381.0, 1784.0, 1381.0, 1815.0, 821.0, 1815.0], "score": 0.99, "text": " pine, with the remainder eucalypts species, most"}, {"category_id": 15, "poly": [780.0, 1718.0, 1073.0, 1718.0, 1073.0, 1748.0, 780.0, 1748.0], "score": 0.97, "text": "nantly Pinus radiata, with"}, {"category_id": 15, "poly": [1096.0, 1718.0, 1381.0, 1718.0, 1381.0, 1748.0, 1096.0, 1748.0], "score": 0.96, "text": "patula planted at the two"}, {"category_id": 15, "poly": [1340.0, 189.0, 1387.0, 189.0, 1387.0, 239.0, 1340.0, 239.0], "score": 1.0, "text": "257"}, {"category_id": 15, "poly": [814.0, 355.0, 1383.0, 355.0, 1383.0, 385.0, 814.0, 385.0], "score": 0.98, "text": "Due to the constraint that the rainfall and time term"}, {"category_id": 15, "poly": [780.0, 456.0, 1381.0, 456.0, 1381.0, 486.0, 780.0, 486.0], "score": 0.97, "text": "the critical value for significance at the 0.05 level."}, {"category_id": 15, "poly": [1136.0, 415.0, 1383.0, 417.0, 1383.0, 454.0, 1136.0, 451.0], "score": 0.98, "text": ", and compared with"}, {"category_id": 15, "poly": [780.0, 486.0, 830.0, 488.0, 830.0, 518.0, 780.0, 516.0], "score": 1.0, "text": "The"}, {"category_id": 15, "poly": [853.0, 486.0, 1160.0, 488.0, 1160.0, 518.0, 853.0, 516.0], "score": 0.99, "text": "-statistic was calculated as:"}, {"category_id": 15, "poly": [782.0, 389.0, 1119.0, 389.0, 1119.0, 419.0, 782.0, 419.0], "score": 0.99, "text": "must be positive, a one tailed"}, {"category_id": 15, "poly": [1134.0, 389.0, 1381.0, 389.0, 1381.0, 419.0, 1134.0, 419.0], "score": 0.98, "text": "-test was applied. 
The"}, {"category_id": 15, "poly": [793.0, 415.0, 1086.0, 417.0, 1086.0, 454.0, 793.0, 451.0], "score": 0.96, "text": "-value was calculated as"}], "page_info": {"page_no": 4, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 1, "poly": [795.8170166015625, 1284.819091796875, 1393.7825927734375, 1284.819091796875, 1393.7825927734375, 1348.0101318359375, 795.8170166015625, 1348.0101318359375], "score": 0.999998927116394}, {"category_id": 0, "poly": [796.8157348632812, 1217.8375244140625, 1043.37890625, 1217.8375244140625, 1043.37890625, 1247.9854736328125, 796.8157348632812, 1247.9854736328125], "score": 0.9999985694885254}, {"category_id": 2, "poly": [129.85670471191406, 196.5391082763672, 165.74688720703125, 196.5391082763672, 165.74688720703125, 216.1761932373047, 129.85670471191406, 216.1761932373047], "score": 0.9999985098838806}, {"category_id": 6, "poly": [129.43519592285156, 1376.390380859375, 374.6851806640625, 1376.390380859375, 374.6851806640625, 1428.6553955078125, 129.43519592285156, 1428.6553955078125], "score": 0.9999979734420776}, {"category_id": 0, "poly": [796.9744873046875, 1150.6444091796875, 911.8010864257812, 1150.6444091796875, 911.8010864257812, 1181.93017578125, 796.9744873046875, 1181.93017578125], "score": 0.9999944567680359}, {"category_id": 1, "poly": [793.93212890625, 785.8515625, 1394.5316162109375, 785.8515625, 1394.5316162109375, 1081.7857666015625, 793.93212890625, 1081.7857666015625], "score": 0.9999939203262329}, {"category_id": 5, "poly": [123.9719009399414, 1433.21337890625, 1400.553466796875, 1433.21337890625, 1400.553466796875, 1814.204345703125, 123.9719009399414, 1814.204345703125], "score": 0.9999938011169434}, {"category_id": 1, "poly": [130.5817108154297, 786.2269897460938, 730.632080078125, 786.2269897460938, 730.632080078125, 1349.01904296875, 130.5817108154297, 1349.01904296875], "score": 0.9999901652336121}, {"category_id": 6, "poly": [128.56288146972656, 253.8047637939453, 514.1275024414062, 
253.8047637939453, 514.1275024414062, 308.0131530761719, 128.56288146972656, 308.0131530761719], "score": 0.9999885559082031}, {"category_id": 5, "poly": [126.36900329589844, 314.1026611328125, 1399.912109375, 314.1026611328125, 1399.912109375, 690.7048950195312, 126.36900329589844, 690.7048950195312], "score": 0.9999723434448242}, {"category_id": 2, "poly": [479.1275939941406, 195.43199157714844, 1044.5283203125, 195.43199157714844, 1044.5283203125, 218.68853759765625, 479.1275939941406, 218.68853759765625], "score": 0.9999346733093262}, {"category_id": 7, "poly": [128.99925231933594, 698.5426635742188, 1394.448486328125, 698.5426635742188, 1394.448486328125, 749.8440551757812, 128.99925231933594, 749.8440551757812], "score": 0.9987799525260925}, {"category_id": 7, "poly": [127.37924194335938, 1819.0853271484375, 1038.7354736328125, 1819.0853271484375, 1038.7354736328125, 1844.94091796875, 127.37924194335938, 1844.94091796875], "score": 0.9987504482269287}, {"category_id": 13, "poly": [626, 696, 660, 696, 660, 720, 626, 720], "score": 0.86, "latex": "5\\%"}, {"category_id": 13, "poly": [1190, 697, 1224, 697, 1224, 720, 1190, 720], "score": 0.86, "latex": "5\\%"}, {"category_id": 13, "poly": [299, 724, 342, 724, 342, 748, 299, 748], "score": 0.85, "latex": "10\\%"}, {"category_id": 13, "poly": [128, 698, 146, 698, 146, 719, 128, 719], "score": 0.69, "latex": "P"}, {"category_id": 13, "poly": [719, 697, 737, 697, 737, 719, 719, 719], "score": 0.44, "latex": "T"}, {"category_id": 13, "poly": [356, 1404, 375, 1404, 375, 1426, 356, 1426], "score": 0.33, "latex": "E"}, {"category_id": 15, "poly": [827.0, 1286.0, 1396.0, 1286.0, 1396.0, 1316.0, 827.0, 1316.0], "score": 0.95, "text": "The fit of the complete model, Eq. (2), to the"}, {"category_id": 15, "poly": [795.0, 1322.0, 1394.0, 1322.0, 1394.0, 1352.0, 795.0, 1352.0], "score": 0.98, "text": "observed data was generally good. 
Table 2 gives"}, {"category_id": 15, "poly": [795.0, 1221.0, 1046.0, 1221.0, 1046.0, 1251.0, 795.0, 1251.0], "score": 1.0, "text": "4.1. Model evaluation"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "258"}, {"category_id": 15, "poly": [131.0, 1378.0, 204.0, 1378.0, 204.0, 1402.0, 131.0, 1402.0], "score": 0.93, "text": "Table 3"}, {"category_id": 15, "poly": [127.0, 1397.0, 355.0, 1400.0, 355.0, 1436.0, 127.0, 1434.0], "score": 0.97, "text": "Coefficient of efficiency,"}, {"category_id": 15, "poly": [790.0, 1150.0, 915.0, 1150.0, 915.0, 1189.0, 790.0, 1189.0], "score": 1.0, "text": "4. Results"}, {"category_id": 15, "poly": [793.0, 787.0, 1396.0, 787.0, 1396.0, 817.0, 793.0, 817.0], "score": 0.99, "text": "and Redhill the lower BFI is matched by the shallow"}, {"category_id": 15, "poly": [793.0, 821.0, 1398.0, 821.0, 1398.0, 849.0, 793.0, 849.0], "score": 0.99, "text": "soils. Pre-treatment data is not available for all"}, {"category_id": 15, "poly": [793.0, 854.0, 1398.0, 854.0, 1398.0, 884.0, 793.0, 884.0], "score": 0.98, "text": "catchment in the data set, so it was decided for the"}, {"category_id": 15, "poly": [793.0, 886.0, 1396.0, 886.0, 1396.0, 916.0, 793.0, 916.0], "score": 1.0, "text": "sake of consistency in the analysis to start each of the"}, {"category_id": 15, "poly": [795.0, 920.0, 1396.0, 920.0, 1396.0, 950.0, 795.0, 950.0], "score": 0.98, "text": "data sets in the year of treatment. The FDCs were"}, {"category_id": 15, "poly": [795.0, 955.0, 1396.0, 955.0, 1396.0, 985.0, 795.0, 985.0], "score": 0.99, "text": "constructed for water years of May-April for eight"}, {"category_id": 15, "poly": [793.0, 987.0, 1396.0, 987.0, 1396.0, 1017.0, 793.0, 1017.0], "score": 1.0, "text": "catchments. 
The 2 Cathedral Peak catchments were"}, {"category_id": 15, "poly": [790.0, 1019.0, 1398.0, 1019.0, 1398.0, 1049.0, 790.0, 1049.0], "score": 0.98, "text": "analysed for November-October because of the"}, {"category_id": 15, "poly": [790.0, 1054.0, 1192.0, 1054.0, 1192.0, 1084.0, 790.0, 1084.0], "score": 1.0, "text": "summer rainfall maxima (Table 2)."}, {"category_id": 15, "poly": [163.0, 785.0, 732.0, 785.0, 732.0, 815.0, 163.0, 815.0], "score": 0.98, "text": "Data on soil characteristics have been obtained"}, {"category_id": 15, "poly": [125.0, 815.0, 735.0, 817.0, 734.0, 854.0, 125.0, 851.0], "score": 1.0, "text": "from published reports and personal communication"}, {"category_id": 15, "poly": [125.0, 849.0, 732.0, 851.0, 732.0, 888.0, 125.0, 886.0], "score": 1.0, "text": "with researchers, but is far from uniform, particularly"}, {"category_id": 15, "poly": [124.0, 886.0, 734.0, 881.0, 735.0, 918.0, 125.0, 922.0], "score": 1.0, "text": "regarding porosity. Consequently only an indication"}, {"category_id": 15, "poly": [129.0, 920.0, 732.0, 920.0, 732.0, 950.0, 129.0, 950.0], "score": 0.99, "text": "of mean depth is reported here. However, this does"}, {"category_id": 15, "poly": [125.0, 950.0, 732.0, 952.0, 732.0, 989.0, 125.0, 987.0], "score": 0.99, "text": " give some indication of the likely relative storage"}, {"category_id": 15, "poly": [129.0, 987.0, 732.0, 987.0, 732.0, 1017.0, 129.0, 1017.0], "score": 0.99, "text": "capacities of the catchments. 
To obtain insights into"}, {"category_id": 15, "poly": [129.0, 1021.0, 732.0, 1021.0, 732.0, 1051.0, 129.0, 1051.0], "score": 0.97, "text": "the pre-afforestation hydrologic characteristics a"}, {"category_id": 15, "poly": [129.0, 1054.0, 732.0, 1054.0, 732.0, 1084.0, 129.0, 1084.0], "score": 0.99, "text": "baseflow separation was performed on the daily"}, {"category_id": 15, "poly": [129.0, 1088.0, 732.0, 1088.0, 732.0, 1118.0, 129.0, 1118.0], "score": 0.97, "text": "fows for the first 3 years following disturbance,"}, {"category_id": 15, "poly": [129.0, 1118.0, 730.0, 1118.0, 730.0, 1148.0, 129.0, 1148.0], "score": 0.98, "text": "using the digital filtering method of Lyne and Hollick"}, {"category_id": 15, "poly": [129.0, 1152.0, 732.0, 1152.0, 732.0, 1182.0, 129.0, 1182.0], "score": 0.98, "text": "(1979) with a filter coefficient of 0.925 and three"}, {"category_id": 15, "poly": [125.0, 1185.0, 734.0, 1182.0, 735.0, 1219.0, 125.0, 1221.0], "score": 0.99, "text": " passes. The resultant average basefow index (BFI),"}, {"category_id": 15, "poly": [129.0, 1221.0, 730.0, 1221.0, 730.0, 1249.0, 129.0, 1249.0], "score": 0.98, "text": "the ratio of baseflow to total flow, is given in Table 1."}, {"category_id": 15, "poly": [129.0, 1251.0, 730.0, 1251.0, 730.0, 1281.0, 129.0, 1281.0], "score": 0.97, "text": "The Australian catchments display a notably"}, {"category_id": 15, "poly": [129.0, 1288.0, 732.0, 1288.0, 732.0, 1316.0, 129.0, 1316.0], "score": 0.98, "text": "lower BFI than the South African and New Zealand"}, {"category_id": 15, "poly": [127.0, 1320.0, 734.0, 1318.0, 735.0, 1348.0, 127.0, 1350.0], "score": 0.96, "text": "catchments. 
For Stewarts Creek, Pine Creek"}, {"category_id": 15, "poly": [129.0, 252.0, 208.0, 252.0, 208.0, 282.0, 129.0, 282.0], "score": 0.98, "text": "Table 2"}, {"category_id": 15, "poly": [129.0, 282.0, 513.0, 282.0, 513.0, 312.0, 129.0, 312.0], "score": 0.99, "text": "Significance of the rainfall and time terms"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [1225.0, 697.0, 1396.0, 697.0, 1396.0, 727.0, 1225.0, 727.0], "score": 0.98, "text": "level, * represents"}, {"category_id": 15, "poly": [129.0, 725.0, 298.0, 725.0, 298.0, 755.0, 129.0, 755.0], "score": 0.98, "text": "significance at the"}, {"category_id": 15, "poly": [343.0, 725.0, 941.0, 725.0, 941.0, 755.0, 343.0, 755.0], "score": 0.99, "text": "level, and na denotes too few data points for meaningful analysis."}, {"category_id": 15, "poly": [147.0, 697.0, 625.0, 697.0, 625.0, 727.0, 147.0, 727.0], "score": 0.98, "text": " indicates that the rainfall term was significant at the"}, {"category_id": 15, "poly": [661.0, 697.0, 718.0, 697.0, 718.0, 727.0, 661.0, 727.0], "score": 1.0, "text": "level,"}, {"category_id": 15, "poly": [738.0, 697.0, 1189.0, 697.0, 1189.0, 727.0, 738.0, 727.0], "score": 0.99, "text": "indicates that the time term was significant at the"}, {"category_id": 15, "poly": [129.0, 1821.0, 1037.0, 1821.0, 1037.0, 1849.0, 129.0, 1849.0], "score": 0.99, "text": "ns Indicates that no solution was found, and na denotes deciles with too few data points for analysis"}], "page_info": {"page_no": 5, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [1345.084228515625, 193.99124145507812, 1383.04443359375, 193.99124145507812, 1383.04443359375, 217.28871154785156, 1345.084228515625, 217.28871154785156], "score": 0.9999984502792358}, {"category_id": 1, "poly": [778.7415161132812, 875.8572387695312, 1385.70263671875, 
875.8572387695312, 1385.70263671875, 1045.03857421875, 778.7415161132812, 1045.03857421875], "score": 0.9999930262565613}, {"category_id": 1, "poly": [112.97018432617188, 850.864990234375, 721.0302124023438, 850.864990234375, 721.0302124023438, 1216.21875, 112.97018432617188, 1216.21875], "score": 0.9999922513961792}, {"category_id": 4, "poly": [777.7315673828125, 753.8668212890625, 1386.6640625, 753.8668212890625, 1386.6640625, 842.7579345703125, 777.7315673828125, 842.7579345703125], "score": 0.9999915957450867}, {"category_id": 1, "poly": [777.9397583007812, 1045.28857421875, 1386.9669189453125, 1045.28857421875, 1386.9669189453125, 1678.6064453125, 777.9397583007812, 1678.6064453125], "score": 0.9999915957450867}, {"category_id": 1, "poly": [112.64908599853516, 250.50961303710938, 720.302001953125, 250.50961303710938, 720.302001953125, 849.3114624023438, 112.64908599853516, 849.3114624023438], "score": 0.9999906420707703}, {"category_id": 1, "poly": [112.41944122314453, 1315.5491943359375, 721.3580932617188, 1315.5491943359375, 721.3580932617188, 1851.324462890625, 112.41944122314453, 1851.324462890625], "score": 0.9999880790710449}, {"category_id": 3, "poly": [776.4273681640625, 253.75418090820312, 1388.254638671875, 253.75418090820312, 1388.254638671875, 736.9627685546875, 776.4273681640625, 736.9627685546875], "score": 0.9999828338623047}, {"category_id": 2, "poly": [464.4588928222656, 193.45211791992188, 1032.725341796875, 193.45211791992188, 1032.725341796875, 219.19715881347656, 464.4588928222656, 219.19715881347656], "score": 0.9999587535858154}, {"category_id": 0, "poly": [115.3223876953125, 1251.6119384765625, 695.34326171875, 1251.6119384765625, 695.34326171875, 1287.6334228515625, 115.3223876953125, 1287.6334228515625], "score": 0.9989659786224365}, {"category_id": 1, "poly": [778.8644409179688, 1705.2630615234375, 1386.922119140625, 1705.2630615234375, 1386.922119140625, 1843.95654296875, 778.8644409179688, 1843.95654296875], "score": 
0.99659264087677}, {"category_id": 13, "poly": [601, 1814, 711, 1814, 711, 1847, 601, 1847], "score": 0.9, "latex": "T{=}\\,2T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [878, 1079, 975, 1079, 975, 1110, 878, 1110], "score": 0.9, "latex": "Y/(Y+a)"}, {"category_id": 13, "poly": [780, 880, 833, 880, 833, 911, 780, 911], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [296, 319, 380, 319, 380, 349, 296, 349], "score": 0.89, "latex": "E\\!>\\!0.7"}, {"category_id": 13, "poly": [160, 1682, 231, 1682, 231, 1713, 160, 1713], "score": 0.88, "latex": "a+Y)"}, {"category_id": 13, "poly": [116, 320, 188, 320, 188, 351, 116, 351], "score": 0.88, "latex": "(77\\%)"}, {"category_id": 13, "poly": [268, 751, 324, 751, 324, 781, 268, 781], "score": 0.87, "latex": "80\\%"}, {"category_id": 13, "poly": [628, 585, 684, 585, 684, 615, 628, 615], "score": 0.87, "latex": "75\\%"}, {"category_id": 13, "poly": [602, 619, 644, 619, 644, 647, 602, 647], "score": 0.85, "latex": "9\\%"}, {"category_id": 13, "poly": [533, 784, 577, 784, 577, 814, 533, 814], "score": 0.83, "latex": "9\\%"}, {"category_id": 13, "poly": [323, 1384, 364, 1384, 364, 1412, 323, 1412], "score": 0.77, "latex": "\\Delta P"}, {"category_id": 13, "poly": [286, 852, 308, 852, 308, 879, 286, 879], "score": 0.75, "latex": "E"}, {"category_id": 13, "poly": [409, 885, 432, 885, 432, 912, 409, 912], "score": 0.71, "latex": "E"}, {"category_id": 13, "poly": [566, 1085, 590, 1085, 590, 1112, 566, 1112], "score": 0.7, "latex": "E"}, {"category_id": 13, "poly": [484, 254, 524, 254, 524, 284, 484, 284], "score": 0.7, "latex": "(E)"}, {"category_id": 13, "poly": [315, 919, 334, 919, 334, 946, 315, 946], "score": 0.66, "latex": "b"}, {"category_id": 13, "poly": [376, 587, 394, 587, 394, 614, 376, 614], "score": 0.62, "latex": "b"}, {"category_id": 13, "poly": [460, 1051, 478, 1051, 478, 1077, 460, 1077], "score": 0.59, "latex": "b"}, {"category_id": 13, "poly": [451, 319, 552, 319, 552, 350, 451, 350], 
"score": 0.46, "latex": "60\\%~0.8"}, {"category_id": 13, "poly": [498, 719, 522, 719, 522, 746, 498, 746], "score": 0.45, "latex": "Y"}, {"category_id": 15, "poly": [1342.0, 191.0, 1387.0, 191.0, 1387.0, 236.0, 1342.0, 236.0], "score": 1.0, "text": "259"}, {"category_id": 15, "poly": [780.0, 914.0, 1383.0, 914.0, 1383.0, 944.0, 780.0, 944.0], "score": 0.99, "text": "most deciles the adjusted FDCs are identical for 12"}, {"category_id": 15, "poly": [780.0, 948.0, 1381.0, 948.0, 1381.0, 978.0, 780.0, 978.0], "score": 0.98, "text": "and 20 years after treatment. This figure clearly"}, {"category_id": 15, "poly": [782.0, 983.0, 1381.0, 983.0, 1381.0, 1013.0, 782.0, 1013.0], "score": 0.98, "text": "demonstrates the necessity for FDC adjustment,"}, {"category_id": 15, "poly": [773.0, 1013.0, 1168.0, 1010.0, 1168.0, 1047.0, 773.0, 1049.0], "score": 0.99, "text": " particularly for the 20 years FDC."}, {"category_id": 15, "poly": [834.0, 882.0, 1383.0, 882.0, 1383.0, 912.0, 834.0, 912.0], "score": 0.99, "text": "values are given in Table 4. Fig. 3 shows that for"}, {"category_id": 15, "poly": [116.0, 955.0, 719.0, 955.0, 719.0, 985.0, 116.0, 985.0], "score": 0.98, "text": "general the model fits the higher flows (lower deciles)"}, {"category_id": 15, "poly": [116.0, 987.0, 719.0, 987.0, 719.0, 1017.0, 116.0, 1017.0], "score": 1.0, "text": "better, most of the poorer fits are in the 80-100"}, {"category_id": 15, "poly": [112.0, 1017.0, 722.0, 1015.0, 722.0, 1051.0, 112.0, 1054.0], "score": 0.97, "text": " percentile range. 
This can be expected given the results"}, {"category_id": 15, "poly": [116.0, 1120.0, 722.0, 1120.0, 722.0, 1150.0, 116.0, 1150.0], "score": 0.99, "text": "Glendhu 2 and for 10th and 20th percentiles from"}, {"category_id": 15, "poly": [114.0, 1150.0, 724.0, 1150.0, 724.0, 1187.0, 114.0, 1187.0], "score": 0.98, "text": "Cathedral Peak 3 may exaggerate the goodness of fit to"}, {"category_id": 15, "poly": [114.0, 1187.0, 646.0, 1187.0, 646.0, 1215.0, 114.0, 1215.0], "score": 0.98, "text": "the exact form of the model (Lane et al., 2003)."}, {"category_id": 15, "poly": [150.0, 854.0, 285.0, 854.0, 285.0, 884.0, 150.0, 884.0], "score": 1.0, "text": "The poorest"}, {"category_id": 15, "poly": [309.0, 854.0, 715.0, 854.0, 715.0, 884.0, 309.0, 884.0], "score": 0.99, "text": "values were those from Lambrechts-"}, {"category_id": 15, "poly": [116.0, 886.0, 408.0, 886.0, 408.0, 916.0, 116.0, 916.0], "score": 0.95, "text": "bos A and B. The high"}, {"category_id": 15, "poly": [433.0, 886.0, 719.0, 886.0, 719.0, 916.0, 433.0, 916.0], "score": 0.96, "text": "for 50-100th deciles at"}, {"category_id": 15, "poly": [114.0, 1088.0, 565.0, 1088.0, 565.0, 1118.0, 114.0, 1118.0], "score": 0.97, "text": "sensitivity analysis suggested that the"}, {"category_id": 15, "poly": [591.0, 1088.0, 722.0, 1088.0, 722.0, 1118.0, 591.0, 1118.0], "score": 0.97, "text": "values for"}, {"category_id": 15, "poly": [116.0, 920.0, 314.0, 920.0, 314.0, 950.0, 116.0, 950.0], "score": 1.0, "text": "Biesievlei, where"}, {"category_id": 15, "poly": [335.0, 920.0, 719.0, 920.0, 719.0, 950.0, 335.0, 950.0], "score": 0.99, "text": "was not significant are notable. In"}, {"category_id": 15, "poly": [114.0, 1054.0, 459.0, 1054.0, 459.0, 1084.0, 114.0, 1084.0], "score": 0.95, "text": "of the significance tests for "}, {"category_id": 15, "poly": [479.0, 1054.0, 719.0, 1054.0, 719.0, 1084.0, 479.0, 1084.0], "score": 0.96, "text": ". 
The results of the"}, {"category_id": 15, "poly": [780.0, 759.0, 1383.0, 759.0, 1383.0, 789.0, 780.0, 789.0], "score": 0.98, "text": "Fig. 3. Examples of observed and fow duration curves adjusted for"}, {"category_id": 15, "poly": [782.0, 787.0, 1381.0, 787.0, 1381.0, 815.0, 782.0, 815.0], "score": 0.97, "text": "average rainfall following afforestation for Stewarts Creek 5,"}, {"category_id": 15, "poly": [779.0, 810.0, 873.0, 815.0, 871.0, 845.0, 777.0, 840.0], "score": 1.0, "text": "Australia."}, {"category_id": 15, "poly": [810.0, 1043.0, 1385.0, 1045.0, 1385.0, 1081.0, 810.0, 1079.0], "score": 0.98, "text": "The relative net flow change due to afforestation is"}, {"category_id": 15, "poly": [775.0, 1109.0, 1387.0, 1112.0, 1387.0, 1148.0, 775.0, 1146.0], "score": 0.99, "text": " old equilibrium water use condition of pre-treatment"}, {"category_id": 15, "poly": [777.0, 1146.0, 1387.0, 1142.0, 1388.0, 1178.0, 778.0, 1183.0], "score": 1.0, "text": "vegetation to the new equilibrium condition at forest"}, {"category_id": 15, "poly": [777.0, 1178.0, 1385.0, 1176.0, 1385.0, 1213.0, 778.0, 1215.0], "score": 0.99, "text": "canopy closure. This quantity is plotted for all catchments"}, {"category_id": 15, "poly": [780.0, 1215.0, 1385.0, 1215.0, 1385.0, 1245.0, 780.0, 1245.0], "score": 0.98, "text": "in Fig. 4. Some deciles have been removed from the data"}, {"category_id": 15, "poly": [782.0, 1247.0, 1383.0, 1247.0, 1383.0, 1277.0, 782.0, 1277.0], "score": 0.98, "text": "set, the 10th and 50th percentile for Glendhu 2 and the"}, {"category_id": 15, "poly": [782.0, 1281.0, 1383.0, 1281.0, 1383.0, 1312.0, 782.0, 1312.0], "score": 1.0, "text": "10th and 20th percentiles from Cathedral Peak 3. 
The"}, {"category_id": 15, "poly": [780.0, 1314.0, 1383.0, 1314.0, 1383.0, 1344.0, 780.0, 1344.0], "score": 0.98, "text": "optimised value of a was zero or near zero for these cases,"}, {"category_id": 15, "poly": [780.0, 1348.0, 1383.0, 1348.0, 1383.0, 1378.0, 780.0, 1378.0], "score": 1.0, "text": "which is not consistent with the conceptual model. The"}, {"category_id": 15, "poly": [775.0, 1378.0, 1385.0, 1376.0, 1385.0, 1413.0, 775.0, 1415.0], "score": 0.98, "text": " changes shown in Fig. 4 are variable. However, there are"}, {"category_id": 15, "poly": [780.0, 1415.0, 1385.0, 1415.0, 1385.0, 1445.0, 780.0, 1445.0], "score": 1.0, "text": "some commonalities between catchment responses. Two"}, {"category_id": 15, "poly": [775.0, 1447.0, 1387.0, 1445.0, 1387.0, 1481.0, 775.0, 1484.0], "score": 0.99, "text": "types of responses (groups) were identified. Group 1"}, {"category_id": 15, "poly": [780.0, 1481.0, 1385.0, 1481.0, 1385.0, 1511.0, 780.0, 1511.0], "score": 1.0, "text": "catchments show a substantial increase in the number of"}, {"category_id": 15, "poly": [777.0, 1514.0, 1387.0, 1514.0, 1387.0, 1550.0, 777.0, 1550.0], "score": 0.96, "text": " zero flow days, with a greater proportional reduction in"}, {"category_id": 15, "poly": [782.0, 1548.0, 1385.0, 1548.0, 1385.0, 1578.0, 782.0, 1578.0], "score": 0.99, "text": "low flows than high fows. Group 2 catchments show a"}, {"category_id": 15, "poly": [780.0, 1582.0, 1383.0, 1582.0, 1383.0, 1612.0, 780.0, 1612.0], "score": 0.99, "text": "more uniform proportional reduction in fows across all"}, {"category_id": 15, "poly": [777.0, 1617.0, 1383.0, 1615.0, 1383.0, 1645.0, 778.0, 1647.0], "score": 1.0, "text": "percentiles, albeit with some variability. 
The catchments"}, {"category_id": 15, "poly": [776.0, 1644.0, 980.0, 1649.0, 979.0, 1686.0, 775.0, 1681.0], "score": 0.95, "text": " in each group are:"}, {"category_id": 15, "poly": [780.0, 1079.0, 877.0, 1079.0, 877.0, 1116.0, 780.0, 1116.0], "score": 1.0, "text": "givenby"}, {"category_id": 15, "poly": [976.0, 1079.0, 1385.0, 1079.0, 1385.0, 1116.0, 976.0, 1116.0], "score": 0.98, "text": ", which represents the change from the"}, {"category_id": 15, "poly": [116.0, 288.0, 717.0, 288.0, 717.0, 318.0, 116.0, 318.0], "score": 0.99, "text": "percentile at all the catchments. The majority of fits"}, {"category_id": 15, "poly": [116.0, 355.0, 719.0, 355.0, 719.0, 385.0, 116.0, 385.0], "score": 0.99, "text": "significance of the rainfall and time terms is given in"}, {"category_id": 15, "poly": [114.0, 387.0, 717.0, 387.0, 717.0, 417.0, 114.0, 417.0], "score": 1.0, "text": "Table 3 for all deciles, where solutions were found."}, {"category_id": 15, "poly": [112.0, 417.0, 720.0, 421.0, 719.0, 452.0, 112.0, 447.0], "score": 0.98, "text": "There were not enough data to fit the model in five"}, {"category_id": 15, "poly": [116.0, 456.0, 717.0, 456.0, 717.0, 484.0, 116.0, 484.0], "score": 0.98, "text": "instances because of extended periods of zero flows."}, {"category_id": 15, "poly": [116.0, 488.0, 719.0, 488.0, 719.0, 518.0, 116.0, 518.0], "score": 0.99, "text": "This problem is addressed to some extent in the zero"}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 550.0, 116.0, 550.0], "score": 1.0, "text": "fow analysis. If the rainfall signal is to be separated"}, {"category_id": 15, "poly": [114.0, 555.0, 719.0, 555.0, 719.0, 585.0, 114.0, 585.0], "score": 0.98, "text": "from the vegetation signal the rainfall terms must be"}, {"category_id": 15, "poly": [110.0, 649.0, 722.0, 651.0, 722.0, 688.0, 109.0, 686.0], "score": 0.99, "text": " 0.10 level. 
The incidence of significance was greatest"}, {"category_id": 15, "poly": [116.0, 688.0, 719.0, 688.0, 719.0, 718.0, 116.0, 718.0], "score": 0.99, "text": "for the 10-50th percentiles at 45 of the 50 data sets at"}, {"category_id": 15, "poly": [112.0, 817.0, 432.0, 815.0, 432.0, 851.0, 112.0, 854.0], "score": 0.98, "text": "significant at the 0.10 level."}, {"category_id": 15, "poly": [189.0, 320.0, 295.0, 320.0, 295.0, 350.0, 189.0, 350.0], "score": 1.0, "text": "returned"}, {"category_id": 15, "poly": [116.0, 755.0, 267.0, 755.0, 267.0, 785.0, 116.0, 785.0], "score": 1.0, "text": "results, with"}, {"category_id": 15, "poly": [325.0, 755.0, 719.0, 755.0, 719.0, 785.0, 325.0, 785.0], "score": 0.97, "text": " of the deciles significant at 0.05"}, {"category_id": 15, "poly": [685.0, 589.0, 722.0, 589.0, 722.0, 619.0, 685.0, 619.0], "score": 1.0, "text": "of"}, {"category_id": 15, "poly": [114.0, 621.0, 601.0, 621.0, 601.0, 649.0, 114.0, 649.0], "score": 0.98, "text": "the deciles at the 0.05 level, and a further"}, {"category_id": 15, "poly": [645.0, 621.0, 719.0, 621.0, 719.0, 649.0, 645.0, 649.0], "score": 0.99, "text": "at the"}, {"category_id": 15, "poly": [116.0, 787.0, 532.0, 787.0, 532.0, 815.0, 116.0, 815.0], "score": 0.96, "text": "level. There were an additional"}, {"category_id": 15, "poly": [578.0, 787.0, 719.0, 787.0, 719.0, 815.0, 578.0, 815.0], "score": 0.99, "text": "of deciles"}, {"category_id": 15, "poly": [110.0, 249.0, 483.0, 252.0, 483.0, 288.0, 109.0, 286.0], "score": 0.91, "text": "the coefficient of efficiency "}, {"category_id": 15, "poly": [525.0, 249.0, 720.0, 252.0, 719.0, 288.0, 525.0, 286.0], "score": 0.96, "text": "for each flow"}, {"category_id": 15, "poly": [116.0, 589.0, 375.0, 589.0, 375.0, 619.0, 116.0, 619.0], "score": 1.0, "text": "significant. 
This term,"}, {"category_id": 15, "poly": [395.0, 589.0, 627.0, 589.0, 627.0, 619.0, 395.0, 619.0], "score": 0.98, "text": ", was significant for"}, {"category_id": 15, "poly": [381.0, 320.0, 450.0, 320.0, 450.0, 350.0, 381.0, 350.0], "score": 0.9, "text": "\uff0cwith"}, {"category_id": 15, "poly": [553.0, 320.0, 719.0, 320.0, 719.0, 350.0, 553.0, 350.0], "score": 0.97, "text": "or better. The"}, {"category_id": 15, "poly": [116.0, 718.0, 497.0, 718.0, 497.0, 748.0, 116.0, 748.0], "score": 0.99, "text": "the 0.05 level. The time term,"}, {"category_id": 15, "poly": [523.0, 718.0, 717.0, 718.0, 717.0, 748.0, 523.0, 748.0], "score": 0.97, "text": "returned similar"}, {"category_id": 15, "poly": [148.0, 1318.0, 719.0, 1318.0, 719.0, 1348.0, 148.0, 1348.0], "score": 0.95, "text": "Following the successful fitting of (2) to the"}, {"category_id": 15, "poly": [114.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 114.0, 1382.0], "score": 0.99, "text": "observed percentiles, the FDCs were adjusted for"}, {"category_id": 15, "poly": [114.0, 1419.0, 719.0, 1419.0, 719.0, 1447.0, 114.0, 1447.0], "score": 0.99, "text": "average annual rainfall. The climate adjusted FDCs"}, {"category_id": 15, "poly": [114.0, 1453.0, 719.0, 1453.0, 719.0, 1481.0, 114.0, 1481.0], "score": 0.96, "text": "produce an estimation of the change in flow"}, {"category_id": 15, "poly": [112.0, 1486.0, 722.0, 1483.0, 722.0, 1514.0, 112.0, 1516.0], "score": 0.98, "text": "percentiles over time for each catchment due to"}, {"category_id": 15, "poly": [116.0, 1518.0, 719.0, 1518.0, 719.0, 1548.0, 116.0, 1548.0], "score": 0.99, "text": "afforestation that may be viewed in two forms: new"}, {"category_id": 15, "poly": [114.0, 1552.0, 722.0, 1552.0, 722.0, 1580.0, 114.0, 1580.0], "score": 0.99, "text": "FDCs, adjusted for climate, as exemplified in Fig. 
3"}, {"category_id": 15, "poly": [112.0, 1587.0, 721.0, 1582.0, 722.0, 1612.0, 112.0, 1617.0], "score": 0.99, "text": "for Stewarts Creek 5, and a comparison between all"}, {"category_id": 15, "poly": [114.0, 1619.0, 717.0, 1619.0, 717.0, 1649.0, 114.0, 1649.0], "score": 0.99, "text": "catchments of the maximum change in yield (given by"}, {"category_id": 15, "poly": [118.0, 1651.0, 722.0, 1651.0, 722.0, 1681.0, 118.0, 1681.0], "score": 0.99, "text": "Y) for each flow percentile from baseline flows (given"}, {"category_id": 15, "poly": [118.0, 1718.0, 715.0, 1718.0, 715.0, 1748.0, 118.0, 1748.0], "score": 0.98, "text": "equilibrium of maximum water use is reached, the"}, {"category_id": 15, "poly": [116.0, 1750.0, 719.0, 1750.0, 719.0, 1780.0, 116.0, 1780.0], "score": 0.99, "text": "adjusted FDCs for individual years should be identical"}, {"category_id": 15, "poly": [114.0, 1784.0, 719.0, 1784.0, 719.0, 1815.0, 114.0, 1815.0], "score": 0.98, "text": "if rainfall variability has been accounted for. The new"}, {"category_id": 15, "poly": [110.0, 1808.0, 600.0, 1813.0, 600.0, 1856.0, 109.0, 1851.0], "score": 0.96, "text": " equilibrium is approximately reached for "}, {"category_id": 15, "poly": [116.0, 1686.0, 159.0, 1686.0, 159.0, 1716.0, 116.0, 1716.0], "score": 1.0, "text": "by"}, {"category_id": 15, "poly": [232.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 232.0, 1716.0], "score": 0.95, "text": " as shown in Fig. 4. Where the new"}, {"category_id": 15, "poly": [116.0, 1387.0, 322.0, 1387.0, 322.0, 1417.0, 116.0, 1417.0], "score": 1.0, "text": "climate by setting"}, {"category_id": 15, "poly": [365.0, 1387.0, 719.0, 1387.0, 719.0, 1417.0, 365.0, 1417.0], "score": 0.99, "text": "to zero, representing long term"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [112.0, 1249.0, 694.0, 1251.0, 694.0, 1288.0, 112.0, 1286.0], "score": 0.99, "text": "4.2. Adjusted FDCs\u2014magnitude of fow reductions"}, {"category_id": 15, "poly": [782.0, 1709.0, 1325.0, 1709.0, 1325.0, 1739.0, 782.0, 1739.0], "score": 0.99, "text": "Group 1: Stewarts Creek, Pine Creek, and Redhill"}, {"category_id": 15, "poly": [782.0, 1744.0, 1381.0, 1744.0, 1381.0, 1774.0, 782.0, 1774.0], "score": 0.99, "text": "Group 2: Cathedral Peak 2 and 3, Lambrechtsbos A,"}, {"category_id": 15, "poly": [889.0, 1776.0, 1383.0, 1776.0, 1383.0, 1806.0, 889.0, 1806.0], "score": 1.0, "text": "Lambrechtsbos B, Glendhu 2, Biesievlei and"}, {"category_id": 15, "poly": [889.0, 1810.0, 1072.0, 1810.0, 1072.0, 1840.0, 889.0, 1840.0], "score": 1.0, "text": "Traralgon Creek"}], "page_info": {"page_no": 6, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 6, "poly": [130.38211059570312, 1373.58056640625, 408.3111877441406, 1373.58056640625, 408.3111877441406, 1427.8253173828125, 130.38211059570312, 1427.8253173828125], "score": 0.9999985694885254}, {"category_id": 2, "poly": [131.90994262695312, 195.1804962158203, 165.77700805664062, 195.1804962158203, 165.77700805664062, 215.41661071777344, 131.90994262695312, 215.41661071777344], "score": 0.9999985098838806}, {"category_id": 2, "poly": [481.0845642089844, 195.8048095703125, 1043.9552001953125, 195.8048095703125, 1043.9552001953125, 218.32778930664062, 481.0845642089844, 218.32778930664062], "score": 0.9999977350234985}, {"category_id": 5, "poly": [124.61016845703125, 1434.242919921875, 1399.2454833984375, 1434.242919921875, 1399.2454833984375, 1811.951171875, 124.61016845703125, 1811.951171875], "score": 0.9999969005584717}, {"category_id": 4, "poly": [510.5360107421875, 734.0053100585938, 1013.9042358398438, 734.0053100585938, 1013.9042358398438, 758.7108154296875, 510.5360107421875, 758.7108154296875], "score": 0.9999968409538269}, 
{"category_id": 1, "poly": [131.32168579101562, 838.9521484375, 730.0957641601562, 838.9521484375, 730.0957641601562, 1314.5084228515625, 131.32168579101562, 1314.5084228515625], "score": 0.9999938011169434}, {"category_id": 3, "poly": [306.1774597167969, 253.64524841308594, 1219.746337890625, 253.64524841308594, 1219.746337890625, 705.4325561523438, 306.1774597167969, 705.4325561523438], "score": 0.9999911785125732}, {"category_id": 1, "poly": [794.51171875, 907.3822631835938, 1395.8782958984375, 907.3822631835938, 1395.8782958984375, 1313.7686767578125, 794.51171875, 1313.7686767578125], "score": 0.9999873042106628}, {"category_id": 7, "poly": [127.02899169921875, 1816.2164306640625, 940.5137939453125, 1816.2164306640625, 940.5137939453125, 1842.17822265625, 127.02899169921875, 1842.17822265625], "score": 0.999592661857605}, {"category_id": 0, "poly": [794.436767578125, 838.79541015625, 1117.543701171875, 838.79541015625, 1117.543701171875, 867.4995727539062, 794.436767578125, 867.4995727539062], "score": 0.9990140795707703}, {"category_id": 13, "poly": [759, 733, 840, 733, 840, 759, 759, 759], "score": 0.9, "latex": "Y/(Y+a)"}, {"category_id": 13, "poly": [815, 1077, 867, 1077, 867, 1108, 815, 1108], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [1088, 1179, 1140, 1179, 1140, 1211, 1088, 1211], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [130, 1247, 196, 1247, 196, 1277, 130, 1277], "score": 0.84, "latex": "100\\%"}, {"category_id": 13, "poly": [209, 1042, 276, 1042, 276, 1072, 209, 1072], "score": 0.84, "latex": "100\\%"}, {"category_id": 13, "poly": [1174, 940, 1224, 940, 1224, 971, 1174, 971], "score": 0.84, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [129, 1401, 172, 1401, 172, 1428, 129, 1428], "score": 0.7, "latex": "T_{\\mathrm{half}}"}, {"category_id": 15, "poly": [129.0, 1372.0, 208.0, 1372.0, 208.0, 1402.0, 129.0, 1402.0], "score": 0.93, "text": "Table 4"}, {"category_id": 15, 
"poly": [173.0, 1400.0, 408.0, 1397.0, 408.0, 1434.0, 173.0, 1436.0], "score": 1.0, "text": "(years) for all catchments"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "260"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [509.0, 733.0, 758.0, 733.0, 758.0, 763.0, 509.0, 763.0], "score": 0.98, "text": "Fig. 4. Net flow reductions"}, {"category_id": 15, "poly": [841.0, 733.0, 1012.0, 733.0, 1012.0, 763.0, 841.0, 763.0], "score": 1.0, "text": "for all catchments."}, {"category_id": 15, "poly": [157.0, 836.0, 737.0, 834.0, 737.0, 871.0, 157.0, 873.0], "score": 0.99, "text": "Group 1 exhibit both the highest reduction of"}, {"category_id": 15, "poly": [127.0, 871.0, 732.0, 871.0, 732.0, 907.0, 127.0, 907.0], "score": 0.98, "text": "flows overall, and show the largest proportional"}, {"category_id": 15, "poly": [129.0, 909.0, 732.0, 909.0, 732.0, 940.0, 129.0, 940.0], "score": 0.96, "text": "reduction at lower flows, leading to a complete"}, {"category_id": 15, "poly": [129.0, 944.0, 732.0, 944.0, 732.0, 974.0, 129.0, 974.0], "score": 0.99, "text": "cessation of fow. Comparison of flow reductions is"}, {"category_id": 15, "poly": [125.0, 972.0, 735.0, 974.0, 734.0, 1011.0, 125.0, 1008.0], "score": 0.99, "text": "hindered slightly by the range of afforestation at the"}, {"category_id": 15, "poly": [129.0, 1010.0, 735.0, 1010.0, 735.0, 1041.0, 129.0, 1041.0], "score": 0.99, "text": "catchments (Table 1). These results could be scaled"}, {"category_id": 15, "poly": [129.0, 1079.0, 730.0, 1079.0, 730.0, 1109.0, 129.0, 1109.0], "score": 0.98, "text": "linear relationship between the area planted and flow"}, {"category_id": 15, "poly": [129.0, 1114.0, 732.0, 1114.0, 732.0, 1144.0, 129.0, 1144.0], "score": 0.98, "text": "reductions. 
As there is no evidence that this is the"}, {"category_id": 15, "poly": [129.0, 1146.0, 728.0, 1146.0, 728.0, 1176.0, 129.0, 1176.0], "score": 0.98, "text": "case we have not presented scaled reductions here."}, {"category_id": 15, "poly": [125.0, 1178.0, 737.0, 1176.0, 737.0, 1213.0, 125.0, 1215.0], "score": 0.95, "text": "Linear scaling would shift the reduction curves"}, {"category_id": 15, "poly": [125.0, 1213.0, 737.0, 1210.0, 737.0, 1247.0, 125.0, 1249.0], "score": 0.96, "text": "upward for those catchments that are less than"}, {"category_id": 15, "poly": [125.0, 1281.0, 492.0, 1286.0, 492.0, 1320.0, 124.0, 1316.0], "score": 0.98, "text": " of the curves or our groupings."}, {"category_id": 15, "poly": [197.0, 1245.0, 735.0, 1247.0, 734.0, 1284.0, 197.0, 1281.0], "score": 0.98, "text": " afforested, but would not change the shape"}, {"category_id": 15, "poly": [129.0, 1045.0, 208.0, 1045.0, 208.0, 1075.0, 129.0, 1075.0], "score": 0.99, "text": "upto"}, {"category_id": 15, "poly": [277.0, 1045.0, 735.0, 1045.0, 735.0, 1075.0, 277.0, 1075.0], "score": 0.98, "text": " afforested if it is assumed there is a"}, {"category_id": 15, "poly": [825.0, 903.0, 1398.0, 905.0, 1398.0, 942.0, 825.0, 940.0], "score": 0.99, "text": "The speed of fow responses to afforestation can be"}, {"category_id": 15, "poly": [793.0, 976.0, 1398.0, 976.0, 1398.0, 1006.0, 793.0, 1006.0], "score": 0.99, "text": "is substantial variation in response times both over the"}, {"category_id": 15, "poly": [788.0, 1008.0, 1402.0, 1004.0, 1403.0, 1041.0, 788.0, 1045.0], "score": 0.99, "text": " percentile spread in some individual catchments, and"}, {"category_id": 15, "poly": [790.0, 1045.0, 1398.0, 1045.0, 1398.0, 1075.0, 790.0, 1075.0], "score": 0.98, "text": " between the catchments. 
The majority of responses have"}, {"category_id": 15, "poly": [793.0, 1114.0, 1398.0, 1114.0, 1398.0, 1142.0, 793.0, 1142.0], "score": 0.99, "text": "Stewarts Creek, Redhill and Lambrechtsbos A exhibit the"}, {"category_id": 15, "poly": [793.0, 1148.0, 1398.0, 1148.0, 1398.0, 1178.0, 793.0, 1178.0], "score": 0.99, "text": "fastest responses, with Biesievlei showing the most"}, {"category_id": 15, "poly": [793.0, 1215.0, 1398.0, 1215.0, 1398.0, 1245.0, 793.0, 1245.0], "score": 0.99, "text": "catchments display a good correspondence to published"}, {"category_id": 15, "poly": [793.0, 1249.0, 1394.0, 1249.0, 1394.0, 1279.0, 793.0, 1279.0], "score": 0.99, "text": "annual changes (Scott et al., 2000; Van Wyk, 1987),"}, {"category_id": 15, "poly": [793.0, 1284.0, 1398.0, 1284.0, 1398.0, 1314.0, 793.0, 1314.0], "score": 0.99, "text": "excepting the 10-20th deciles for both Cathedral Peak"}, {"category_id": 15, "poly": [790.0, 1079.0, 814.0, 1079.0, 814.0, 1109.0, 790.0, 1109.0], "score": 0.96, "text": "a"}, {"category_id": 15, "poly": [868.0, 1079.0, 1398.0, 1079.0, 1398.0, 1109.0, 868.0, 1109.0], "score": 0.99, "text": "value between 5 and 10 years. Pine Creek and"}, {"category_id": 15, "poly": [788.0, 1178.0, 1087.0, 1176.0, 1087.0, 1213.0, 788.0, 1215.0], "score": 1.0, "text": "uniformly slow response."}, {"category_id": 15, "poly": [1141.0, 1178.0, 1400.0, 1176.0, 1400.0, 1213.0, 1141.0, 1215.0], "score": 0.94, "text": "for the South African"}, {"category_id": 15, "poly": [793.0, 942.0, 1173.0, 942.0, 1173.0, 972.0, 793.0, 972.0], "score": 1.0, "text": "evaluated by examining the value of"}, {"category_id": 15, "poly": [1225.0, 942.0, 1398.0, 942.0, 1398.0, 972.0, 1225.0, 972.0], "score": 0.97, "text": "(Table 4). 
There"}, {"category_id": 15, "poly": [129.0, 1815.0, 939.0, 1817.0, 939.0, 1847.0, 129.0, 1845.0], "score": 0.98, "text": "Note that no solution could be found for the 50 percentile for Glendhu indicted by the ns."}, {"category_id": 15, "poly": [793.0, 838.0, 1123.0, 838.0, 1123.0, 875.0, 793.0, 875.0], "score": 1.0, "text": "4.3. Timing of fow reductions"}], "page_info": {"page_no": 7, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [781.8721923828125, 1618.0399169921875, 928.7534790039062, 1618.0399169921875, 928.7534790039062, 1646.409912109375, 781.8721923828125, 1646.409912109375], "score": 0.9999984502792358}, {"category_id": 6, "poly": [113.6822738647461, 253.5222625732422, 1383.692626953125, 253.5222625732422, 1383.692626953125, 334.94287109375, 113.6822738647461, 334.94287109375], "score": 0.9999982714653015}, {"category_id": 2, "poly": [1345.8267822265625, 196.56283569335938, 1379.01416015625, 196.56283569335938, 1379.01416015625, 215.46395874023438, 1345.8267822265625, 215.46395874023438], "score": 0.999997615814209}, {"category_id": 2, "poly": [467.5944519042969, 195.73922729492188, 1030.1298828125, 195.73922729492188, 1030.1298828125, 218.0099639892578, 467.5944519042969, 218.0099639892578], "score": 0.9999966621398926}, {"category_id": 1, "poly": [117.23767852783203, 1243.1109619140625, 716.3004150390625, 1243.1109619140625, 716.3004150390625, 1847.4835205078125, 117.23767852783203, 1847.4835205078125], "score": 0.9999922513961792}, {"category_id": 1, "poly": [118.47014617919922, 994.3162841796875, 713.6297607421875, 994.3162841796875, 713.6297607421875, 1122.2752685546875, 118.47014617919922, 1122.2752685546875], "score": 0.9999918937683105}, {"category_id": 0, "poly": [119.10287475585938, 1178.6876220703125, 627.3995971679688, 1178.6876220703125, 627.3995971679688, 1206.4453125, 119.10287475585938, 1206.4453125], "score": 0.9999906420707703}, {"category_id": 1, "poly": [782.2766723632812, 1682.373291015625, 
1380.46728515625, 1682.373291015625, 1380.46728515625, 1845.99609375, 782.2766723632812, 1845.99609375], "score": 0.9999901652336121}, {"category_id": 1, "poly": [781.3034057617188, 1060.36083984375, 1379.3385009765625, 1060.36083984375, 1379.3385009765625, 1521.511962890625, 781.3034057617188, 1521.511962890625], "score": 0.9999886751174927}, {"category_id": 5, "poly": [113.5753402709961, 341.08380126953125, 1386.5994873046875, 341.08380126953125, 1386.5994873046875, 689.2748413085938, 113.5753402709961, 689.2748413085938], "score": 0.9999498128890991}, {"category_id": 7, "poly": [118.46651458740234, 696.7362670898438, 1380.3681640625, 696.7362670898438, 1380.3681640625, 861.0716552734375, 118.46651458740234, 861.0716552734375], "score": 0.9997767210006714}, {"category_id": 0, "poly": [782.720458984375, 993.1400146484375, 988.5659790039062, 993.1400146484375, 988.5659790039062, 1020.3793334960938, 782.720458984375, 1020.3793334960938], "score": 0.9996650218963623}, {"category_id": 13, "poly": [458, 778, 601, 778, 601, 806, 458, 806], "score": 0.91, "latex": "\\sum Y/\\sum(a+Y)"}, {"category_id": 13, "poly": [169, 1025, 221, 1025, 221, 1056, 169, 1056], "score": 0.91, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [464, 750, 607, 750, 607, 778, 464, 778], "score": 0.88, "latex": "\\sum Y/\\sum(a+Y)"}, {"category_id": 13, "poly": [1201, 1191, 1277, 1191, 1277, 1221, 1201, 1221], "score": 0.88, "latex": "\\Delta N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [1296, 1323, 1350, 1323, 1350, 1353, 1296, 1353], "score": 0.86, "latex": "50\\%"}, {"category_id": 13, "poly": [1078, 1159, 1101, 1159, 1101, 1185, 1078, 1185], "score": 0.77, "latex": "E"}, {"category_id": 13, "poly": [1113, 1192, 1133, 1192, 1133, 1219, 1113, 1219], "score": 0.69, "latex": "b"}, {"category_id": 13, "poly": [375, 811, 390, 811, 390, 830, 375, 830], "score": 0.67, "latex": "a"}, {"category_id": 13, "poly": [990, 1196, 1003, 1196, 1003, 1218, 990, 1218], "score": 0.61, "latex": 
"t\\cdot"}, {"category_id": 13, "poly": [1066, 812, 1080, 812, 1080, 830, 1066, 830], "score": 0.58, "latex": "a"}, {"category_id": 13, "poly": [431, 808, 448, 808, 448, 830, 431, 830], "score": 0.46, "latex": "Y"}, {"category_id": 13, "poly": [1246, 1357, 1283, 1357, 1283, 1386, 1246, 1386], "score": 0.43, "latex": "\\mathrm{Ck}"}, {"category_id": 13, "poly": [773, 779, 827, 779, 827, 804, 773, 804], "score": 0.42, "latex": "100\\mathrm{th}"}, {"category_id": 13, "poly": [1107, 1357, 1144, 1357, 1144, 1386, 1107, 1386], "score": 0.41, "latex": "\\mathrm{Ck}"}, {"category_id": 13, "poly": [640, 807, 684, 807, 684, 831, 640, 831], "score": 0.29, "latex": "20\\mathrm{th}"}, {"category_id": 15, "poly": [776.0, 1612.0, 935.0, 1617.0, 933.0, 1656.0, 775.0, 1651.0], "score": 0.97, "text": " 5. Discussion"}, {"category_id": 15, "poly": [112.0, 252.0, 195.0, 252.0, 195.0, 282.0, 112.0, 282.0], "score": 0.98, "text": "Table5"}, {"category_id": 15, "poly": [112.0, 279.0, 1383.0, 282.0, 1383.0, 312.0, 112.0, 310.0], "score": 0.99, "text": " Published fow reductions from paired catchment analyses, after Scott et al. (2000), Hickel (2001), Nandakumar and Mein (1993) and Fahey and"}, {"category_id": 15, "poly": [112.0, 307.0, 681.0, 310.0, 681.0, 340.0, 112.0, 338.0], "score": 0.97, "text": "Jackson (1997) compared to estimated reductions in this study"}, {"category_id": 15, "poly": [1330.0, 202.0, 1368.0, 177.0, 1393.0, 215.0, 1355.0, 240.0], "score": 0.99, "text": "261"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [148.0, 1243.0, 720.0, 1245.0, 719.0, 1275.0, 148.0, 1273.0], "score": 0.98, "text": "A further check on the overall model performance is"}, {"category_id": 15, "poly": [112.0, 1279.0, 719.0, 1277.0, 720.0, 1307.0, 112.0, 1309.0], "score": 0.98, "text": " a comparison with published results of paired catchment"}, {"category_id": 15, "poly": [114.0, 1312.0, 719.0, 1312.0, 719.0, 1342.0, 114.0, 1342.0], "score": 1.0, "text": "studies. The data that can be compared with our results"}, {"category_id": 15, "poly": [114.0, 1346.0, 719.0, 1346.0, 719.0, 1376.0, 114.0, 1376.0], "score": 0.98, "text": "are presented in Table 5 and can be broadly compared"}, {"category_id": 15, "poly": [114.0, 1380.0, 719.0, 1380.0, 719.0, 1410.0, 114.0, 1410.0], "score": 0.99, "text": "with Fig. 4. These data are reductions in years with near"}, {"category_id": 15, "poly": [114.0, 1415.0, 722.0, 1415.0, 722.0, 1445.0, 114.0, 1445.0], "score": 0.99, "text": "average annual rainfall, and at a time after treatment "}, {"category_id": 15, "poly": [114.0, 1445.0, 717.0, 1445.0, 717.0, 1475.0, 114.0, 1475.0], "score": 0.99, "text": "when maximum changes in streamflow have occurred."}, {"category_id": 15, "poly": [114.0, 1481.0, 719.0, 1481.0, 719.0, 1509.0, 114.0, 1509.0], "score": 0.99, "text": "Table 5 also includes estimates on the total and low flow"}, {"category_id": 15, "poly": [114.0, 1511.0, 719.0, 1511.0, 719.0, 1542.0, 114.0, 1542.0], "score": 0.99, "text": "reductions calculated from this study. Results from Pine"}, {"category_id": 15, "poly": [114.0, 1548.0, 719.0, 1548.0, 719.0, 1578.0, 114.0, 1578.0], "score": 0.98, "text": "Creek and Traralgon Creek are not included in Table 5"}, {"category_id": 15, "poly": [114.0, 1582.0, 722.0, 1582.0, 722.0, 1612.0, 114.0, 1612.0], "score": 1.0, "text": "as these catchments are not paired. 
Exact comparisons"}, {"category_id": 15, "poly": [116.0, 1615.0, 719.0, 1615.0, 719.0, 1645.0, 116.0, 1645.0], "score": 0.99, "text": "are impossible because of the rainfall variability, and"}, {"category_id": 15, "poly": [116.0, 1649.0, 717.0, 1649.0, 717.0, 1679.0, 116.0, 1679.0], "score": 0.99, "text": "lack of calibration period for Redhill. Despite this,"}, {"category_id": 15, "poly": [116.0, 1681.0, 719.0, 1681.0, 719.0, 1711.0, 116.0, 1711.0], "score": 0.95, "text": "Table 5 shows that total and low flow reductions"}, {"category_id": 15, "poly": [116.0, 1716.0, 719.0, 1716.0, 719.0, 1746.0, 116.0, 1746.0], "score": 0.99, "text": "estimated from our study are comparable to the results"}, {"category_id": 15, "poly": [116.0, 1750.0, 719.0, 1750.0, 719.0, 1780.0, 116.0, 1780.0], "score": 0.99, "text": "from paired catchment studies, indicating that our"}, {"category_id": 15, "poly": [116.0, 1782.0, 719.0, 1782.0, 719.0, 1812.0, 116.0, 1812.0], "score": 0.99, "text": "simple model has successfully removed the rainfall"}, {"category_id": 15, "poly": [111.0, 1820.0, 190.0, 1814.0, 192.0, 1846.0, 113.0, 1852.0], "score": 0.94, "text": "signal."}, {"category_id": 15, "poly": [112.0, 989.0, 717.0, 987.0, 717.0, 1023.0, 112.0, 1026.0], "score": 1.0, "text": "catchments and the lower deciles at Lambrechtsbos B."}, {"category_id": 15, "poly": [116.0, 1062.0, 715.0, 1062.0, 715.0, 1092.0, 116.0, 1092.0], "score": 0.98, "text": "lower than other published data (Fahey and Jackson,"}, {"category_id": 15, "poly": [116.0, 1092.0, 189.0, 1092.0, 189.0, 1124.0, 116.0, 1124.0], "score": 0.96, "text": "1997)."}, {"category_id": 15, "poly": [116.0, 1028.0, 168.0, 1028.0, 168.0, 1058.0, 116.0, 1058.0], "score": 1.0, "text": "The"}, {"category_id": 15, "poly": [222.0, 1028.0, 715.0, 1028.0, 715.0, 1058.0, 222.0, 1058.0], "score": 1.0, "text": "from Glendhu 2 appears to be substantially"}, {"category_id": 15, "poly": [114.0, 1176.0, 631.0, 1176.0, 631.0, 1213.0, 114.0, 1213.0], 
"score": 0.99, "text": "4.4. Comparison with paired catchment studies"}, {"category_id": 15, "poly": [816.0, 1686.0, 1381.0, 1686.0, 1381.0, 1716.0, 816.0, 1716.0], "score": 0.98, "text": "The aims of the project have largely been met. The"}, {"category_id": 15, "poly": [782.0, 1720.0, 1383.0, 1720.0, 1383.0, 1750.0, 782.0, 1750.0], "score": 0.97, "text": "general characterisation of FDCs and adjustment for"}, {"category_id": 15, "poly": [782.0, 1750.0, 1385.0, 1750.0, 1385.0, 1787.0, 782.0, 1787.0], "score": 0.99, "text": "climate has been very encouraging given the task of"}, {"category_id": 15, "poly": [782.0, 1784.0, 1381.0, 1784.0, 1381.0, 1815.0, 782.0, 1815.0], "score": 0.98, "text": "fitting our model to 10 flow percentiles, for 10 different"}, {"category_id": 15, "poly": [778.0, 1812.0, 1383.0, 1815.0, 1383.0, 1851.0, 777.0, 1849.0], "score": 0.97, "text": "catchments (resulting in 100 model fits\uff09 with"}, {"category_id": 15, "poly": [816.0, 1060.0, 1381.0, 1060.0, 1381.0, 1090.0, 816.0, 1090.0], "score": 0.96, "text": "As this analysis could only be applied, where there"}, {"category_id": 15, "poly": [782.0, 1094.0, 1383.0, 1094.0, 1383.0, 1124.0, 782.0, 1124.0], "score": 1.0, "text": "was consistent drying up of streams, it was confined to"}, {"category_id": 15, "poly": [782.0, 1127.0, 1381.0, 1127.0, 1381.0, 1157.0, 782.0, 1157.0], "score": 0.99, "text": "Stewarts Creek, Pine Creek and Redhill catchments. The"}, {"category_id": 15, "poly": [780.0, 1228.0, 1383.0, 1228.0, 1383.0, 1256.0, 780.0, 1256.0], "score": 0.99, "text": "significant results at the 0.05 level for both parameters at"}, {"category_id": 15, "poly": [780.0, 1260.0, 1383.0, 1260.0, 1383.0, 1290.0, 780.0, 1290.0], "score": 0.97, "text": "all three catchments. The climate adjusted zero flow"}, {"category_id": 15, "poly": [780.0, 1292.0, 1383.0, 1292.0, 1383.0, 1322.0, 780.0, 1322.0], "score": 1.0, "text": "days are shown in Fig. 5. 
The increases in zero flow days"}, {"category_id": 15, "poly": [778.0, 1387.0, 1387.0, 1389.0, 1387.0, 1425.0, 777.0, 1423.0], "score": 0.97, "text": "11 at Redhill. The latter has changed from an almost "}, {"category_id": 15, "poly": [775.0, 1423.0, 1385.0, 1421.0, 1385.0, 1458.0, 775.0, 1460.0], "score": 0.97, "text": " permanent to a highly intermittent stream. The curves"}, {"category_id": 15, "poly": [775.0, 1453.0, 1385.0, 1456.0, 1385.0, 1492.0, 775.0, 1490.0], "score": 0.99, "text": " are also in sensible agreement with the flow reductions"}, {"category_id": 15, "poly": [777.0, 1492.0, 885.0, 1492.0, 885.0, 1522.0, 777.0, 1522.0], "score": 0.99, "text": "in Fig. 4."}, {"category_id": 15, "poly": [1278.0, 1193.0, 1383.0, 1193.0, 1383.0, 1223.0, 1278.0, 1223.0], "score": 0.98, "text": " returned"}, {"category_id": 15, "poly": [777.0, 1327.0, 1295.0, 1327.0, 1295.0, 1357.0, 777.0, 1357.0], "score": 0.98, "text": " are substantial with flows confined to less than"}, {"category_id": 15, "poly": [1351.0, 1327.0, 1385.0, 1327.0, 1385.0, 1357.0, 1351.0, 1357.0], "score": 1.0, "text": "of"}, {"category_id": 15, "poly": [780.0, 1161.0, 1077.0, 1161.0, 1077.0, 1189.0, 780.0, 1189.0], "score": 0.96, "text": "model returned values of"}, {"category_id": 15, "poly": [1102.0, 1161.0, 1381.0, 1161.0, 1381.0, 1189.0, 1102.0, 1189.0], "score": 0.98, "text": "of 0.95, 0.99 and 0.99,"}, {"category_id": 15, "poly": [1134.0, 1193.0, 1200.0, 1193.0, 1200.0, 1223.0, 1134.0, 1223.0], "score": 0.97, "text": " and"}, {"category_id": 15, "poly": [780.0, 1193.0, 989.0, 1193.0, 989.0, 1223.0, 780.0, 1223.0], "score": 0.98, "text": "respectively. 
The"}, {"category_id": 15, "poly": [1004.0, 1193.0, 1112.0, 1193.0, 1112.0, 1223.0, 1004.0, 1223.0], "score": 0.89, "text": "-testson"}, {"category_id": 15, "poly": [1284.0, 1354.0, 1385.0, 1357.0, 1385.0, 1393.0, 1284.0, 1391.0], "score": 1.0, "text": "and year"}, {"category_id": 15, "poly": [775.0, 1354.0, 1106.0, 1357.0, 1106.0, 1393.0, 775.0, 1391.0], "score": 0.98, "text": " the time by year 8 at Stewarts"}, {"category_id": 15, "poly": [1145.0, 1354.0, 1245.0, 1357.0, 1245.0, 1393.0, 1145.0, 1391.0], "score": 1.0, "text": "and Pine"}, {"category_id": 15, "poly": [125.0, 690.0, 1385.0, 694.0, 1385.0, 731.0, 125.0, 727.0], "score": 0.97, "text": "a Rainfall refers to the rainfall in the year used for comparison of results. The value in brackets refers to the deviation from the mean anual"}, {"category_id": 15, "poly": [114.0, 722.0, 408.0, 725.0, 408.0, 755.0, 114.0, 752.0], "score": 1.0, "text": "rainfall for the period of record."}, {"category_id": 15, "poly": [112.0, 832.0, 312.0, 834.0, 311.0, 864.0, 112.0, 862.0], "score": 0.99, "text": " 30-100th percentiles."}, {"category_id": 15, "poly": [125.0, 776.0, 457.0, 780.0, 457.0, 811.0, 125.0, 806.0], "score": 0.96, "text": "c Low flow reduction calculated by"}, {"category_id": 15, "poly": [122.0, 748.0, 463.0, 750.0, 463.0, 780.0, 122.0, 778.0], "score": 0.98, "text": "b Total flow reduction calculated by"}, {"category_id": 15, "poly": [608.0, 748.0, 743.0, 750.0, 743.0, 780.0, 608.0, 778.0], "score": 1.0, "text": "for all deciles."}, {"category_id": 15, "poly": [123.0, 800.0, 374.0, 806.0, 374.0, 843.0, 122.0, 836.0], "score": 0.98, "text": "d For Cathedral Peak 3 the"}, {"category_id": 15, "poly": [1081.0, 800.0, 1385.0, 806.0, 1385.0, 843.0, 1081.0, 836.0], "score": 0.99, "text": "were lower then the values of the"}, {"category_id": 15, "poly": [391.0, 800.0, 430.0, 806.0, 430.0, 843.0, 391.0, 836.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [602.0, 776.0, 772.0, 780.0, 772.0, 811.0, 602.0, 
806.0], "score": 0.99, "text": "for 70, 80, 90 and"}, {"category_id": 15, "poly": [828.0, 776.0, 934.0, 780.0, 934.0, 811.0, 828.0, 806.0], "score": 1.0, "text": "percentiles."}, {"category_id": 15, "poly": [449.0, 800.0, 639.0, 806.0, 639.0, 843.0, 449.0, 836.0], "score": 0.98, "text": "values for the 10 and"}, {"category_id": 15, "poly": [685.0, 800.0, 1065.0, 806.0, 1065.0, 843.0, 685.0, 836.0], "score": 0.99, "text": "percentiles were excluded as the values of"}, {"category_id": 15, "poly": [778.0, 987.0, 993.0, 991.0, 992.0, 1030.0, 777.0, 1025.0], "score": 1.0, "text": "4.5. Zero fow days"}], "page_info": {"page_no": 8, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 4, "poly": [130.26629638671875, 1337.3250732421875, 732.2774047851562, 1337.3250732421875, 732.2774047851562, 1418.7119140625, 130.26629638671875, 1418.7119140625], "score": 0.9999995231628418}, {"category_id": 2, "poly": [131.43930053710938, 195.23593139648438, 165.80084228515625, 195.23593139648438, 165.80084228515625, 215.28358459472656, 131.43930053710938, 215.28358459472656], "score": 0.999997615814209}, {"category_id": 3, "poly": [131.99615478515625, 262.13848876953125, 732.1187744140625, 262.13848876953125, 732.1187744140625, 1316.8990478515625, 131.99615478515625, 1316.8990478515625], "score": 0.9999966621398926}, {"category_id": 2, "poly": [480.581787109375, 195.6566162109375, 1043.8951416015625, 195.6566162109375, 1043.8951416015625, 218.63613891601562, 480.581787109375, 218.63613891601562], "score": 0.9999943971633911}, {"category_id": 1, "poly": [794.3954467773438, 1085.8665771484375, 1394.706298828125, 1085.8665771484375, 1394.706298828125, 1847.09423828125, 794.3954467773438, 1847.09423828125], "score": 0.9999863505363464}, {"category_id": 1, "poly": [795.3126831054688, 256.7186279296875, 1395.0584716796875, 256.7186279296875, 1395.0584716796875, 1079.958251953125, 795.3126831054688, 1079.958251953125], "score": 0.9999836683273315}, {"category_id": 1, "poly": 
[130.74447631835938, 1445.3975830078125, 731.3636474609375, 1445.3975830078125, 731.3636474609375, 1846.5950927734375, 130.74447631835938, 1846.5950927734375], "score": 0.9999815225601196}, {"category_id": 13, "poly": [1045, 452, 1098, 452, 1098, 482, 1045, 482], "score": 0.87, "latex": "27\\%"}, {"category_id": 15, "poly": [129.0, 1339.0, 732.0, 1339.0, 732.0, 1367.0, 129.0, 1367.0], "score": 0.98, "text": "Fig. 5. Number of zero fow days for average rainfall following"}, {"category_id": 15, "poly": [131.0, 1365.0, 730.0, 1365.0, 730.0, 1393.0, 131.0, 1393.0], "score": 0.98, "text": "afforestation for Stewarts Creek 5, Redhill and Pine Creek,"}, {"category_id": 15, "poly": [133.0, 1398.0, 219.0, 1398.0, 219.0, 1421.0, 133.0, 1421.0], "score": 1.0, "text": "Australia."}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 0.97, "text": "262"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [827.0, 1086.0, 1398.0, 1086.0, 1398.0, 1116.0, 827.0, 1116.0], "score": 0.99, "text": "The model fits show we have quantified the net"}, {"category_id": 15, "poly": [793.0, 1120.0, 1396.0, 1120.0, 1396.0, 1150.0, 793.0, 1150.0], "score": 0.98, "text": "impact of afforestation for the majority of the flow"}, {"category_id": 15, "poly": [786.0, 1150.0, 1398.0, 1146.0, 1398.0, 1182.0, 786.0, 1187.0], "score": 0.99, "text": " percentiles in the 10 catchments. Results for the 10-50th"}, {"category_id": 15, "poly": [788.0, 1185.0, 1400.0, 1180.0, 1400.0, 1217.0, 788.0, 1221.0], "score": 0.98, "text": " percentiles were particularly encouraging. 
It is not"}, {"category_id": 15, "poly": [793.0, 1219.0, 1396.0, 1219.0, 1396.0, 1249.0, 793.0, 1249.0], "score": 0.99, "text": "surprising that the relationship between rainfall and flow"}, {"category_id": 15, "poly": [793.0, 1251.0, 1396.0, 1251.0, 1396.0, 1281.0, 793.0, 1281.0], "score": 0.97, "text": "diminishes at lower fows (60-100th percentile), where"}, {"category_id": 15, "poly": [793.0, 1286.0, 1396.0, 1286.0, 1396.0, 1316.0, 793.0, 1316.0], "score": 0.98, "text": "seasonal storage effects and rainfall distribution become"}, {"category_id": 15, "poly": [788.0, 1318.0, 1396.0, 1318.0, 1396.0, 1348.0, 788.0, 1348.0], "score": 0.97, "text": " more important drivers for runoff generation. The"}, {"category_id": 15, "poly": [790.0, 1352.0, 1398.0, 1352.0, 1398.0, 1382.0, 790.0, 1382.0], "score": 0.99, "text": "poorest model fits were gained for Lambrechtsbos A"}, {"category_id": 15, "poly": [788.0, 1382.0, 1396.0, 1382.0, 1396.0, 1413.0, 788.0, 1413.0], "score": 0.98, "text": " and B. The likely reason at Lambrechtsbos A is an"}, {"category_id": 15, "poly": [793.0, 1419.0, 1398.0, 1419.0, 1398.0, 1447.0, 793.0, 1447.0], "score": 0.98, "text": "observed annual decrease in stand water use after 12"}, {"category_id": 15, "poly": [793.0, 1451.0, 1398.0, 1451.0, 1398.0, 1481.0, 793.0, 1481.0], "score": 0.99, "text": "years (Scott et al., 2000) which does not conform to the"}, {"category_id": 15, "poly": [793.0, 1486.0, 1398.0, 1486.0, 1398.0, 1516.0, 793.0, 1516.0], "score": 0.99, "text": "sigmoidal form of our model over the full 19 years of"}, {"category_id": 15, "poly": [793.0, 1518.0, 1398.0, 1518.0, 1398.0, 1548.0, 793.0, 1548.0], "score": 0.99, "text": "record. The failure of the model to fit the lower flows at"}, {"category_id": 15, "poly": [786.0, 1546.0, 1400.0, 1548.0, 1400.0, 1585.0, 786.0, 1582.0], "score": 0.98, "text": " Lambrechtsbos B is not as explicable. 
A decrease in"}, {"category_id": 15, "poly": [790.0, 1587.0, 1398.0, 1587.0, 1398.0, 1615.0, 790.0, 1615.0], "score": 1.0, "text": "stand water use in this catchment is observed as the"}, {"category_id": 15, "poly": [793.0, 1619.0, 1400.0, 1619.0, 1400.0, 1649.0, 793.0, 1649.0], "score": 0.99, "text": "plantation ages, but does not occur during the first 20"}, {"category_id": 15, "poly": [793.0, 1651.0, 1398.0, 1651.0, 1398.0, 1681.0, 793.0, 1681.0], "score": 0.99, "text": "years after treatment (Scott et al., 2000). Other data from"}, {"category_id": 15, "poly": [788.0, 1679.0, 1398.0, 1681.0, 1398.0, 1718.0, 788.0, 1716.0], "score": 0.98, "text": " South Africa (Scott et al., 2000) indicate there are"}, {"category_id": 15, "poly": [791.0, 1711.0, 1398.0, 1716.0, 1398.0, 1752.0, 790.0, 1748.0], "score": 0.99, "text": " diminished flow reductions as plantations age, but again,"}, {"category_id": 15, "poly": [795.0, 1752.0, 1396.0, 1752.0, 1396.0, 1782.0, 795.0, 1782.0], "score": 0.99, "text": "generally after 20 years. Our use of an asymptotic curve"}, {"category_id": 15, "poly": [790.0, 1785.0, 1398.0, 1782.0, 1398.0, 1812.0, 790.0, 1815.0], "score": 0.98, "text": "assumes a new equilibrium of stand water use is"}, {"category_id": 15, "poly": [790.0, 1815.0, 1394.0, 1817.0, 1394.0, 1847.0, 790.0, 1845.0], "score": 0.99, "text": "reached. 
The results of the model fitting generally justify"}, {"category_id": 15, "poly": [788.0, 249.0, 1398.0, 252.0, 1398.0, 288.0, 788.0, 286.0], "score": 0.97, "text": " Lambrechtsbos B appear to be over-estimated by our"}, {"category_id": 15, "poly": [788.0, 284.0, 1398.0, 288.0, 1398.0, 325.0, 788.0, 320.0], "score": 0.99, "text": " model, which is unsurprising as the model fit was poor."}, {"category_id": 15, "poly": [793.0, 322.0, 1398.0, 322.0, 1398.0, 353.0, 793.0, 353.0], "score": 0.98, "text": "The remaining four South African catchments, and also"}, {"category_id": 15, "poly": [793.0, 355.0, 1398.0, 355.0, 1398.0, 385.0, 793.0, 385.0], "score": 0.99, "text": "Redhill and Stewarts Creek are in good agreement with"}, {"category_id": 15, "poly": [793.0, 389.0, 1400.0, 389.0, 1400.0, 417.0, 793.0, 417.0], "score": 1.0, "text": "the published values, particularly when the deviation of"}, {"category_id": 15, "poly": [793.0, 421.0, 1398.0, 421.0, 1398.0, 452.0, 793.0, 452.0], "score": 0.99, "text": "average rainfall is considered. Glendhu 2 reductions are"}, {"category_id": 15, "poly": [788.0, 488.0, 1396.0, 488.0, 1396.0, 518.0, 788.0, 518.0], "score": 0.99, "text": " a heavier impact on the lower flows. Overall, it appears"}, {"category_id": 15, "poly": [788.0, 518.0, 1400.0, 518.0, 1400.0, 555.0, 788.0, 555.0], "score": 0.99, "text": " there are no significant discrepancies with the published"}, {"category_id": 15, "poly": [793.0, 555.0, 1398.0, 555.0, 1398.0, 585.0, 793.0, 585.0], "score": 0.99, "text": "paired catchment analyses. 
We suggest our technique"}, {"category_id": 15, "poly": [790.0, 589.0, 1398.0, 589.0, 1398.0, 619.0, 790.0, 619.0], "score": 0.99, "text": " represents an alternative to the paired-catchment method"}, {"category_id": 15, "poly": [788.0, 617.0, 1398.0, 619.0, 1398.0, 656.0, 788.0, 654.0], "score": 0.98, "text": "for assessing hydrologic response to vegetation treat-"}, {"category_id": 15, "poly": [788.0, 651.0, 1400.0, 649.0, 1400.0, 686.0, 788.0, 688.0], "score": 1.0, "text": " ment, where paired data are unavailable. The method"}, {"category_id": 15, "poly": [793.0, 688.0, 1396.0, 688.0, 1396.0, 718.0, 793.0, 718.0], "score": 0.99, "text": "has not yet resulted in a predictive model, but has"}, {"category_id": 15, "poly": [795.0, 722.0, 1394.0, 722.0, 1394.0, 752.0, 795.0, 752.0], "score": 1.0, "text": "increased our knowledge of afforestation impacts. This"}, {"category_id": 15, "poly": [793.0, 755.0, 1398.0, 755.0, 1398.0, 785.0, 793.0, 785.0], "score": 0.96, "text": "is a valuable outcome given the contentious issue of"}, {"category_id": 15, "poly": [793.0, 787.0, 1398.0, 787.0, 1398.0, 817.0, 793.0, 817.0], "score": 0.98, "text": "afforestation in Australia and other countries, and a"}, {"category_id": 15, "poly": [792.0, 821.0, 1398.0, 819.0, 1398.0, 849.0, 793.0, 851.0], "score": 0.99, "text": "current paucity of data on inter-annual flows. 
It should"}, {"category_id": 15, "poly": [788.0, 849.0, 1396.0, 851.0, 1396.0, 888.0, 788.0, 886.0], "score": 0.99, "text": " be noted that nine of the 10 catchment were pine species."}, {"category_id": 15, "poly": [793.0, 888.0, 1398.0, 888.0, 1398.0, 918.0, 793.0, 918.0], "score": 0.98, "text": "More data is required to compare the impact of"}, {"category_id": 15, "poly": [790.0, 920.0, 1390.0, 920.0, 1390.0, 950.0, 790.0, 950.0], "score": 0.99, "text": "hardwood species, particularly eucalypts, on the FDC."}, {"category_id": 15, "poly": [793.0, 950.0, 1396.0, 955.0, 1396.0, 985.0, 792.0, 980.0], "score": 0.98, "text": "Unfortunately these data are currently scarce. There are"}, {"category_id": 15, "poly": [791.0, 980.0, 1398.0, 985.0, 1398.0, 1021.0, 790.0, 1017.0], "score": 1.0, "text": "substantial data on the physiological controls of eucalypt"}, {"category_id": 15, "poly": [793.0, 1019.0, 1396.0, 1019.0, 1396.0, 1049.0, 793.0, 1049.0], "score": 0.99, "text": "water use (see Whitehead and Beadle, 2004), but not at"}, {"category_id": 15, "poly": [790.0, 1054.0, 1016.0, 1054.0, 1016.0, 1084.0, 790.0, 1084.0], "score": 0.98, "text": "the catchment scale."}, {"category_id": 15, "poly": [788.0, 452.0, 1044.0, 452.0, 1044.0, 488.0, 788.0, 488.0], "score": 0.91, "text": " close to the reported "}, {"category_id": 15, "poly": [1099.0, 452.0, 1398.0, 452.0, 1398.0, 488.0, 1099.0, 488.0], "score": 0.97, "text": ", but our model produces"}, {"category_id": 15, "poly": [129.0, 1443.0, 732.0, 1443.0, 732.0, 1479.0, 129.0, 1479.0], "score": 0.99, "text": "substantially varying spatial scales, soils and geology,"}, {"category_id": 15, "poly": [129.0, 1479.0, 732.0, 1479.0, 732.0, 1509.0, 129.0, 1509.0], "score": 0.99, "text": "species planted and climatic environments. 
Although"}, {"category_id": 15, "poly": [127.0, 1511.0, 735.0, 1511.0, 735.0, 1548.0, 127.0, 1548.0], "score": 0.98, "text": "there were poor results for individual deciles, the FDCs "}, {"category_id": 15, "poly": [129.0, 1548.0, 732.0, 1548.0, 732.0, 1578.0, 129.0, 1578.0], "score": 0.99, "text": "at eight of the 10 catchments were adequately described"}, {"category_id": 15, "poly": [127.0, 1580.0, 730.0, 1578.0, 730.0, 1608.0, 127.0, 1610.0], "score": 0.99, "text": "by Eq. (2). The results of the statistical tests in which the"}, {"category_id": 15, "poly": [129.0, 1612.0, 726.0, 1612.0, 726.0, 1643.0, 129.0, 1643.0], "score": 0.99, "text": "rainfall term was significant for most deciles demon-"}, {"category_id": 15, "poly": [127.0, 1647.0, 735.0, 1647.0, 735.0, 1683.0, 127.0, 1683.0], "score": 0.99, "text": "strated the model structure was appropriate for adjusting"}, {"category_id": 15, "poly": [129.0, 1683.0, 730.0, 1683.0, 730.0, 1711.0, 129.0, 1711.0], "score": 0.96, "text": "the FDCs for climatic (rainfall) variability. The"}, {"category_id": 15, "poly": [129.0, 1716.0, 732.0, 1716.0, 732.0, 1746.0, 129.0, 1746.0], "score": 0.95, "text": "comparisons of our results with published paired"}, {"category_id": 15, "poly": [129.0, 1750.0, 732.0, 1750.0, 732.0, 1780.0, 129.0, 1780.0], "score": 0.98, "text": "catchment analyses are satisfactory, although the"}, {"category_id": 15, "poly": [129.0, 1784.0, 735.0, 1784.0, 735.0, 1812.0, 129.0, 1812.0], "score": 0.98, "text": "different methodologies make direct comparisons of"}, {"category_id": 15, "poly": [127.0, 1815.0, 737.0, 1817.0, 737.0, 1847.0, 127.0, 1845.0], "score": 0.98, "text": "deciles with total fow uncertain. 
Low flows at"}], "page_info": {"page_no": 9, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [466.4325256347656, 194.8888397216797, 1031.1922607421875, 194.8888397216797, 1031.1922607421875, 219.64439392089844, 466.4325256347656, 219.64439392089844], "score": 0.9999977350234985}, {"category_id": 0, "poly": [781.4110107421875, 1350.9322509765625, 1112.086181640625, 1350.9322509765625, 1112.086181640625, 1380.4071044921875, 781.4110107421875, 1380.4071044921875], "score": 0.9999973773956299}, {"category_id": 1, "poly": [118.7479248046875, 587.0300903320312, 715.7766723632812, 587.0300903320312, 715.7766723632812, 883.0694580078125, 118.7479248046875, 883.0694580078125], "score": 0.9999968409538269}, {"category_id": 1, "poly": [118.48811340332031, 252.823486328125, 715.749267578125, 252.823486328125, 715.749267578125, 583.28515625, 118.48811340332031, 583.28515625], "score": 0.9999964237213135}, {"category_id": 1, "poly": [117.62772369384766, 885.8139038085938, 717.3323974609375, 885.8139038085938, 717.3323974609375, 1415.2767333984375, 117.62772369384766, 1415.2767333984375], "score": 0.9999961853027344}, {"category_id": 1, "poly": [782.490234375, 254.01434326171875, 1380.5517578125, 254.01434326171875, 1380.5517578125, 748.8712768554688, 782.490234375, 748.8712768554688], "score": 0.9999944567680359}, {"category_id": 1, "poly": [117.28860473632812, 1415.5831298828125, 716.8341064453125, 1415.5831298828125, 716.8341064453125, 1847.5146484375, 117.28860473632812, 1847.5146484375], "score": 0.9999933242797852}, {"category_id": 1, "poly": [781.5156860351562, 752.15576171875, 1380.3497314453125, 752.15576171875, 1380.3497314453125, 1279.9158935546875, 781.5156860351562, 1279.9158935546875], "score": 0.9999922513961792}, {"category_id": 1, "poly": [781.4845581054688, 1417.2979736328125, 1380.813232421875, 1417.2979736328125, 1380.813232421875, 1845.5704345703125, 781.4845581054688, 1845.5704345703125], "score": 0.9999920725822449}, 
{"category_id": 2, "poly": [1346.2413330078125, 196.15005493164062, 1380.82568359375, 196.15005493164062, 1380.82568359375, 216.4473876953125, 1346.2413330078125, 216.4473876953125], "score": 0.9999884366989136}, {"category_id": 13, "poly": [510, 1017, 563, 1017, 563, 1047, 510, 1047], "score": 0.89, "latex": "85\\%"}, {"category_id": 13, "poly": [1121, 321, 1143, 321, 1143, 347, 1121, 347], "score": 0.55, "latex": "E"}, {"category_id": 13, "poly": [433, 354, 456, 354, 456, 380, 433, 380], "score": 0.47, "latex": "E."}, {"category_id": 13, "poly": [578, 1018, 683, 1018, 683, 1048, 578, 1048], "score": 0.39, "latex": "1260\\,\\mathrm{mm}"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [780.0, 1350.0, 1117.0, 1350.0, 1117.0, 1387.0, 780.0, 1387.0], "score": 0.99, "text": "6. Summary and conclusions"}, {"category_id": 15, "poly": [150.0, 585.0, 717.0, 585.0, 717.0, 615.0, 150.0, 615.0], "score": 0.98, "text": "The small Australian catchments converted to pine in"}, {"category_id": 15, "poly": [109.0, 619.0, 724.0, 617.0, 724.0, 654.0, 110.0, 656.0], "score": 1.0, "text": "response group 1 (Stewarts Creek 5, Pine Creek and"}, {"category_id": 15, "poly": [112.0, 649.0, 722.0, 651.0, 722.0, 688.0, 112.0, 686.0], "score": 0.99, "text": "Redhill) have similar shallow soils, potential evapo-"}, {"category_id": 15, "poly": [116.0, 688.0, 717.0, 688.0, 717.0, 718.0, 116.0, 718.0], "score": 0.99, "text": "transpiration and rainfall distribution (relatively uni-"}, {"category_id": 15, "poly": [116.0, 722.0, 717.0, 722.0, 717.0, 752.0, 116.0, 752.0], "score": 1.0, "text": "form) although Stewarts Creek is significantly wetter."}, {"category_id": 15, "poly": [116.0, 755.0, 719.0, 755.0, 719.0, 785.0, 116.0, 785.0], "score": 0.98, "text": "The combination of small catchment area and the"}, {"category_id": 15, "poly": [116.0, 
787.0, 717.0, 787.0, 717.0, 817.0, 116.0, 817.0], "score": 0.99, "text": "increased transpirative demand that exceeds summer"}, {"category_id": 15, "poly": [114.0, 819.0, 720.0, 821.0, 719.0, 851.0, 114.0, 849.0], "score": 1.0, "text": "and autumn rainfall and stored water results in the large"}, {"category_id": 15, "poly": [116.0, 856.0, 638.0, 856.0, 638.0, 884.0, 116.0, 884.0], "score": 0.98, "text": "impact on lower flows, compared to high flows."}, {"category_id": 15, "poly": [114.0, 252.0, 722.0, 252.0, 722.0, 288.0, 114.0, 288.0], "score": 0.98, "text": "this assumption for the length of commercial plantation"}, {"category_id": 15, "poly": [114.0, 286.0, 719.0, 286.0, 719.0, 322.0, 114.0, 322.0], "score": 0.99, "text": "growth (up to 20 years) considered here. The physio-"}, {"category_id": 15, "poly": [116.0, 322.0, 719.0, 322.0, 719.0, 353.0, 116.0, 353.0], "score": 0.99, "text": "logical relationship between stand age and water use for"}, {"category_id": 15, "poly": [114.0, 385.0, 719.0, 385.0, 719.0, 421.0, 114.0, 421.0], "score": 1.0, "text": "thoroughly investigated, although Cornish and Vertessy"}, {"category_id": 15, "poly": [112.0, 417.0, 722.0, 419.0, 722.0, 456.0, 112.0, 454.0], "score": 0.99, "text": "(2001) and Roberts et al. 
(2001) have shown young"}, {"category_id": 15, "poly": [116.0, 456.0, 719.0, 456.0, 719.0, 486.0, 116.0, 486.0], "score": 1.0, "text": "mixed species eucalypt forests may use more water than"}, {"category_id": 15, "poly": [112.0, 484.0, 722.0, 486.0, 722.0, 522.0, 112.0, 520.0], "score": 0.99, "text": " mature stands, and Putahena and Cordery (2000) suggest "}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 553.0, 116.0, 553.0], "score": 0.99, "text": "maximum Pinus radiata water use may have been"}, {"category_id": 15, "poly": [114.0, 553.0, 655.0, 553.0, 655.0, 583.0, 114.0, 583.0], "score": 0.99, "text": "reached after 12 years, with a subsequent decline."}, {"category_id": 15, "poly": [116.0, 355.0, 432.0, 355.0, 432.0, 385.0, 116.0, 385.0], "score": 1.0, "text": "plantation species other than"}, {"category_id": 15, "poly": [457.0, 355.0, 719.0, 355.0, 719.0, 385.0, 457.0, 385.0], "score": 0.96, "text": "regnans have not been"}, {"category_id": 15, "poly": [146.0, 881.0, 720.0, 884.0, 719.0, 920.0, 146.0, 918.0], "score": 0.98, "text": " The magnitude of the response within Group 2 varies"}, {"category_id": 15, "poly": [116.0, 920.0, 717.0, 920.0, 717.0, 950.0, 116.0, 950.0], "score": 1.0, "text": "considerably, with greater reduction in flows in the two"}, {"category_id": 15, "poly": [116.0, 952.0, 717.0, 952.0, 717.0, 983.0, 116.0, 983.0], "score": 0.98, "text": "Cathedral Peak catchments, and Lambrechtsbos B."}, {"category_id": 15, "poly": [114.0, 987.0, 719.0, 987.0, 719.0, 1017.0, 114.0, 1017.0], "score": 0.99, "text": "Potential evaporation is in phase with rainfall at the"}, {"category_id": 15, "poly": [114.0, 1054.0, 722.0, 1054.0, 722.0, 1084.0, 114.0, 1084.0], "score": 0.98, "text": "average) of their rainfall in summer. 
The conjunction of"}, {"category_id": 15, "poly": [116.0, 1088.0, 722.0, 1088.0, 722.0, 1118.0, 116.0, 1118.0], "score": 0.99, "text": "peak demand and plant water availability may explain"}, {"category_id": 15, "poly": [114.0, 1118.0, 722.0, 1120.0, 722.0, 1150.0, 114.0, 1148.0], "score": 0.98, "text": "the high reductions relative to the remaining catchments "}, {"category_id": 15, "poly": [116.0, 1155.0, 722.0, 1155.0, 722.0, 1185.0, 116.0, 1185.0], "score": 0.98, "text": "in Group 2. In addition, the stocking density was"}, {"category_id": 15, "poly": [116.0, 1187.0, 717.0, 1187.0, 717.0, 1215.0, 116.0, 1215.0], "score": 0.97, "text": "described as \u2018abnormally dense\u2019 by Scott et al. (2000)."}, {"category_id": 15, "poly": [116.0, 1219.0, 719.0, 1219.0, 719.0, 1249.0, 116.0, 1249.0], "score": 1.0, "text": "Growth at Glendhu 2 was notably slow (Fahey and"}, {"category_id": 15, "poly": [112.0, 1247.0, 722.0, 1249.0, 722.0, 1286.0, 112.0, 1284.0], "score": 0.98, "text": " Jackson, 1997) and Lambrechtsbos A and Biesievlei are"}, {"category_id": 15, "poly": [116.0, 1286.0, 719.0, 1286.0, 719.0, 1316.0, 116.0, 1316.0], "score": 1.0, "text": "described as being within sub optimal growth zones"}, {"category_id": 15, "poly": [116.0, 1318.0, 719.0, 1318.0, 719.0, 1348.0, 116.0, 1348.0], "score": 0.99, "text": "(Scott and Smith, 1997) characterised by these authors"}, {"category_id": 15, "poly": [114.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 114.0, 1382.0], "score": 0.99, "text": "as having relatively slow response times and lesser"}, {"category_id": 15, "poly": [109.0, 1383.0, 586.0, 1380.0, 586.0, 1417.0, 110.0, 1419.0], "score": 0.96, "text": " reductions that those at more optimal sites."}, {"category_id": 15, "poly": [112.0, 1017.0, 509.0, 1017.0, 509.0, 1054.0, 112.0, 1054.0], "score": 0.98, "text": " Cathedral Peak sites as they receive"}, {"category_id": 15, "poly": [684.0, 1017.0, 719.0, 1017.0, 719.0, 1054.0, 684.0, 1054.0], "score": 0.93, "text": "on"}, 
{"category_id": 15, "poly": [810.0, 249.0, 1383.0, 252.0, 1383.0, 288.0, 810.0, 286.0], "score": 1.0, "text": "Traralgon Creek would be expected to have both the"}, {"category_id": 15, "poly": [780.0, 286.0, 1385.0, 286.0, 1385.0, 322.0, 780.0, 322.0], "score": 0.98, "text": "most subdued flow reductions and longer response time"}, {"category_id": 15, "poly": [780.0, 355.0, 1385.0, 355.0, 1385.0, 385.0, 780.0, 385.0], "score": 0.97, "text": "uncertain vegetation record. Peak stand water use of a"}, {"category_id": 15, "poly": [777.0, 389.0, 1381.0, 389.0, 1381.0, 419.0, 777.0, 419.0], "score": 0.99, "text": "natural stand of this species is around 30 years."}, {"category_id": 15, "poly": [782.0, 421.0, 1383.0, 421.0, 1383.0, 452.0, 782.0, 452.0], "score": 0.98, "text": "Additionally in this large, \u2018real world\u2019 catchment,"}, {"category_id": 15, "poly": [780.0, 456.0, 1385.0, 456.0, 1385.0, 486.0, 780.0, 486.0], "score": 0.99, "text": "there is a continuous cycle of forest management"}, {"category_id": 15, "poly": [782.0, 488.0, 1385.0, 488.0, 1385.0, 518.0, 782.0, 518.0], "score": 0.98, "text": "which includes harvesting. A mixture of pasture and"}, {"category_id": 15, "poly": [784.0, 522.0, 1383.0, 522.0, 1383.0, 553.0, 784.0, 553.0], "score": 0.99, "text": "'scrub', which could represent significant understorey"}, {"category_id": 15, "poly": [780.0, 555.0, 1381.0, 555.0, 1381.0, 585.0, 780.0, 585.0], "score": 0.99, "text": "stands, were replaced by plantation species. Conse-"}, {"category_id": 15, "poly": [780.0, 589.0, 1385.0, 589.0, 1385.0, 619.0, 780.0, 619.0], "score": 0.99, "text": "quently the difference between pre and post treatment"}, {"category_id": 15, "poly": [777.0, 619.0, 1383.0, 619.0, 1383.0, 649.0, 777.0, 649.0], "score": 0.98, "text": "ET may be less than at other catchments. 
Reductions of"}, {"category_id": 15, "poly": [777.0, 651.0, 1385.0, 651.0, 1385.0, 688.0, 777.0, 688.0], "score": 1.0, "text": "this magnitude could be more readily expected in larger,"}, {"category_id": 15, "poly": [780.0, 688.0, 1383.0, 688.0, 1383.0, 718.0, 780.0, 718.0], "score": 0.99, "text": "multi land use catchments than the very high impacts"}, {"category_id": 15, "poly": [782.0, 720.0, 1293.0, 720.0, 1293.0, 750.0, 782.0, 750.0], "score": 0.98, "text": "estimated at the smaller Australian catchments."}, {"category_id": 15, "poly": [780.0, 322.0, 1120.0, 322.0, 1120.0, 353.0, 780.0, 353.0], "score": 0.93, "text": "because of the large area of "}, {"category_id": 15, "poly": [1144.0, 322.0, 1385.0, 322.0, 1385.0, 353.0, 1144.0, 353.0], "score": 0.99, "text": "regnans forest, and"}, {"category_id": 15, "poly": [146.0, 1412.0, 720.0, 1415.0, 719.0, 1451.0, 146.0, 1449.0], "score": 0.98, "text": " The response groups may be in part explained by the"}, {"category_id": 15, "poly": [109.0, 1449.0, 722.0, 1447.0, 722.0, 1483.0, 110.0, 1486.0], "score": 0.96, "text": "storage characteristics of the catchments. Accurate"}, {"category_id": 15, "poly": [114.0, 1486.0, 717.0, 1486.0, 717.0, 1516.0, 114.0, 1516.0], "score": 0.99, "text": "measures of storage are not available from the literature,"}, {"category_id": 15, "poly": [114.0, 1518.0, 719.0, 1518.0, 719.0, 1546.0, 114.0, 1546.0], "score": 0.98, "text": "but the soil depths and the baseflow index (Table 1) both"}, {"category_id": 15, "poly": [114.0, 1552.0, 722.0, 1552.0, 722.0, 1580.0, 114.0, 1580.0], "score": 0.97, "text": "show the three south eastern Australian catchments with"}, {"category_id": 15, "poly": [114.0, 1587.0, 722.0, 1587.0, 722.0, 1617.0, 114.0, 1617.0], "score": 0.98, "text": "the greatest reduction are likely to have the lowest"}, {"category_id": 15, "poly": [109.0, 1617.0, 722.0, 1615.0, 722.0, 1651.0, 110.0, 1653.0], "score": 0.99, "text": " storage capacity. 
The greater flow reductions, particu-"}, {"category_id": 15, "poly": [116.0, 1651.0, 717.0, 1651.0, 717.0, 1681.0, 116.0, 1681.0], "score": 0.97, "text": "larly for low flows, could be expected under these"}, {"category_id": 15, "poly": [116.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 116.0, 1716.0], "score": 0.99, "text": "conditions. Inclusion of a storage term in the model is an"}, {"category_id": 15, "poly": [116.0, 1718.0, 719.0, 1718.0, 719.0, 1748.0, 116.0, 1748.0], "score": 0.99, "text": "obvious option for improving the analysis. However the"}, {"category_id": 15, "poly": [116.0, 1752.0, 719.0, 1752.0, 719.0, 1782.0, 116.0, 1782.0], "score": 0.98, "text": "addition of extra parameters would be at the cost of"}, {"category_id": 15, "poly": [116.0, 1784.0, 717.0, 1784.0, 717.0, 1815.0, 116.0, 1815.0], "score": 0.99, "text": "maintaining model simplicity, particularly as character-"}, {"category_id": 15, "poly": [116.0, 1817.0, 518.0, 1817.0, 518.0, 1847.0, 116.0, 1847.0], "score": 1.0, "text": "ising a transient storage is not trivial."}, {"category_id": 15, "poly": [816.0, 755.0, 1381.0, 755.0, 1381.0, 785.0, 816.0, 785.0], "score": 0.97, "text": "The analysis of zero flow days was successful,"}, {"category_id": 15, "poly": [782.0, 787.0, 1383.0, 787.0, 1383.0, 817.0, 782.0, 817.0], "score": 0.99, "text": "demonstrating that the impact on flow intermittence can"}, {"category_id": 15, "poly": [780.0, 819.0, 1383.0, 819.0, 1383.0, 849.0, 780.0, 849.0], "score": 1.0, "text": "be evaluated without of the entire FDC. This was helpful"}, {"category_id": 15, "poly": [777.0, 854.0, 1381.0, 854.0, 1381.0, 884.0, 777.0, 884.0], "score": 0.98, "text": " as the change in the higher percentiles (low flows) could"}, {"category_id": 15, "poly": [782.0, 886.0, 1381.0, 886.0, 1381.0, 916.0, 782.0, 916.0], "score": 0.97, "text": "not always be modelled. 
The results for the three"}, {"category_id": 15, "poly": [782.0, 920.0, 1381.0, 920.0, 1381.0, 950.0, 782.0, 950.0], "score": 0.98, "text": "catchments analysed are a rather stark indication of the"}, {"category_id": 15, "poly": [782.0, 955.0, 1383.0, 955.0, 1383.0, 985.0, 782.0, 985.0], "score": 0.99, "text": "potential for highly increased zero flow periods in small"}, {"category_id": 15, "poly": [782.0, 987.0, 1381.0, 987.0, 1381.0, 1017.0, 782.0, 1017.0], "score": 0.99, "text": "catchments, at least in south-eastern Australia. However,"}, {"category_id": 15, "poly": [778.0, 1015.0, 1383.0, 1019.0, 1383.0, 1054.0, 777.0, 1049.0], "score": 0.99, "text": "it should be noted these curves probably represent a"}, {"category_id": 15, "poly": [777.0, 1054.0, 1383.0, 1054.0, 1383.0, 1084.0, 777.0, 1084.0], "score": 0.97, "text": " maximum response as they are all derived from small"}, {"category_id": 15, "poly": [777.0, 1088.0, 1381.0, 1088.0, 1381.0, 1118.0, 777.0, 1118.0], "score": 0.97, "text": "catchments with small storage capacities and large"}, {"category_id": 15, "poly": [780.0, 1120.0, 1385.0, 1120.0, 1385.0, 1150.0, 780.0, 1150.0], "score": 0.99, "text": "percentages of afforestation. This method could be used"}, {"category_id": 15, "poly": [777.0, 1150.0, 1381.0, 1150.0, 1381.0, 1180.0, 777.0, 1180.0], "score": 1.0, "text": "to determine change in the occurrence of any given flow"}, {"category_id": 15, "poly": [780.0, 1187.0, 1383.0, 1187.0, 1383.0, 1217.0, 780.0, 1217.0], "score": 0.98, "text": "in response to afforestation; e.g. 
to determine the"}, {"category_id": 15, "poly": [775.0, 1215.0, 1387.0, 1217.0, 1387.0, 1253.0, 775.0, 1251.0], "score": 0.96, "text": "likelihood of maintaining a reservoir storage or an"}, {"category_id": 15, "poly": [782.0, 1253.0, 1381.0, 1253.0, 1381.0, 1284.0, 782.0, 1284.0], "score": 1.0, "text": "environmental fow that requires an average critical flow."}, {"category_id": 15, "poly": [810.0, 1412.0, 1385.0, 1417.0, 1385.0, 1453.0, 810.0, 1449.0], "score": 0.98, "text": "This project sought to (i) develop a method to remove"}, {"category_id": 15, "poly": [780.0, 1453.0, 1383.0, 1453.0, 1383.0, 1481.0, 780.0, 1481.0], "score": 1.0, "text": "the climate signal from streamflow records to identify"}, {"category_id": 15, "poly": [780.0, 1486.0, 1383.0, 1486.0, 1383.0, 1516.0, 780.0, 1516.0], "score": 0.98, "text": "the impact of vegetation on flow from afforested"}, {"category_id": 15, "poly": [780.0, 1518.0, 1385.0, 1518.0, 1385.0, 1548.0, 780.0, 1548.0], "score": 1.0, "text": "catchments, and (ii) quantify this impact on the flow"}, {"category_id": 15, "poly": [777.0, 1550.0, 1383.0, 1550.0, 1383.0, 1580.0, 777.0, 1580.0], "score": 0.99, "text": "duration curve. A simple model was proposed that"}, {"category_id": 15, "poly": [775.0, 1582.0, 1387.0, 1582.0, 1387.0, 1619.0, 775.0, 1619.0], "score": 0.98, "text": " considered the age of plantation and the annual rainfall"}, {"category_id": 15, "poly": [777.0, 1619.0, 1385.0, 1619.0, 1385.0, 1649.0, 777.0, 1649.0], "score": 0.98, "text": "to be the principal drivers for evapotranspiration. 
This"}, {"category_id": 15, "poly": [780.0, 1651.0, 1385.0, 1651.0, 1385.0, 1679.0, 780.0, 1679.0], "score": 0.99, "text": "model was fitted to the observed deciles of the FDC, and"}, {"category_id": 15, "poly": [778.0, 1681.0, 1381.0, 1686.0, 1381.0, 1716.0, 777.0, 1711.0], "score": 0.97, "text": "the climate signal was then removed from the stream-"}, {"category_id": 15, "poly": [780.0, 1716.0, 1387.0, 1716.0, 1387.0, 1752.0, 780.0, 1752.0], "score": 0.99, "text": "flow records by adjusting the FDC for average rainfall"}, {"category_id": 15, "poly": [777.0, 1748.0, 1385.0, 1746.0, 1385.0, 1782.0, 778.0, 1785.0], "score": 0.98, "text": "over the period of record. The model was tested and"}, {"category_id": 15, "poly": [777.0, 1780.0, 1381.0, 1780.0, 1381.0, 1817.0, 777.0, 1817.0], "score": 0.99, "text": "applied to 10 afforested catchments. We successfully"}, {"category_id": 15, "poly": [778.0, 1810.0, 1385.0, 1815.0, 1385.0, 1851.0, 777.0, 1847.0], "score": 1.0, "text": "fitted our model to catchments with varying spatial"}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "263"}], "page_info": {"page_no": 10, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [132.2023162841797, 944.2422485351562, 352.39361572265625, 944.2422485351562, 352.39361572265625, 973.2088623046875, 132.2023162841797, 973.2088623046875], "score": 0.9999986886978149}, {"category_id": 2, "poly": [480.25787353515625, 196.37384033203125, 1044.1737060546875, 196.37384033203125, 1044.1737060546875, 218.59146118164062, 480.25787353515625, 218.59146118164062], "score": 0.9999973177909851}, {"category_id": 1, "poly": [131.0494842529297, 255.921875, 730.36865234375, 255.921875, 730.36865234375, 848.8026123046875, 131.0494842529297, 848.8026123046875], "score": 0.9999949336051941}, {"category_id": 1, "poly": [130.6344757080078, 1010.9166870117188, 730.247314453125, 1010.9166870117188, 730.247314453125, 
1437.8150634765625, 130.6344757080078, 1437.8150634765625], "score": 0.9999939203262329}, {"category_id": 2, "poly": [130.68336486816406, 196.22927856445312, 164.99691772460938, 196.22927856445312, 164.99691772460938, 215.17306518554688, 130.68336486816406, 215.17306518554688], "score": 0.9999921321868896}, {"category_id": 1, "poly": [131.60971069335938, 1597.386962890625, 732.5863647460938, 1597.386962890625, 732.5863647460938, 1846.581787109375, 131.60971069335938, 1846.581787109375], "score": 0.9999908208847046}, {"category_id": 1, "poly": [791.6022338867188, 251.6699676513672, 1397.9674072265625, 251.6699676513672, 1397.9674072265625, 1848.8499755859375, 791.6022338867188, 1848.8499755859375], "score": 0.9999874830245972}, {"category_id": 0, "poly": [131.00613403320312, 1534.647705078125, 256.1180725097656, 1534.647705078125, 256.1180725097656, 1561.1875, 131.00613403320312, 1561.1875], "score": 0.9999844431877136}, {"category_id": 13, "poly": [1067, 1022, 1120, 1022, 1120, 1049, 1067, 1049], "score": 0.57, "latex": "219{\\mathrm{~p~}}"}, {"category_id": 15, "poly": [129.0, 939.0, 357.0, 944.0, 356.0, 983.0, 129.0, 978.0], "score": 1.0, "text": "Acknowledgements"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [127.0, 252.0, 734.0, 249.0, 735.0, 286.0, 127.0, 288.0], "score": 0.99, "text": "scales, species and environments, and have shown that it"}, {"category_id": 15, "poly": [129.0, 290.0, 732.0, 290.0, 732.0, 320.0, 129.0, 320.0], "score": 0.99, "text": "provides a means of separating the influence of climate"}, {"category_id": 15, "poly": [129.0, 322.0, 732.0, 322.0, 732.0, 353.0, 129.0, 353.0], "score": 0.99, "text": "and vegetation on the FDCs. 
The modelled results"}, {"category_id": 15, "poly": [129.0, 353.0, 730.0, 353.0, 730.0, 383.0, 129.0, 383.0], "score": 0.97, "text": "showed the greatest proportional impacts were for"}, {"category_id": 15, "poly": [129.0, 387.0, 732.0, 387.0, 732.0, 417.0, 129.0, 417.0], "score": 0.99, "text": "median and lower flows. The flow reductions from the"}, {"category_id": 15, "poly": [129.0, 421.0, 732.0, 421.0, 732.0, 449.0, 129.0, 449.0], "score": 0.98, "text": "three small catchments SE Australian were the highest"}, {"category_id": 15, "poly": [129.0, 456.0, 735.0, 456.0, 735.0, 486.0, 129.0, 486.0], "score": 0.99, "text": "and may reflect lower storages. The characterisation of"}, {"category_id": 15, "poly": [129.0, 488.0, 732.0, 488.0, 732.0, 518.0, 129.0, 518.0], "score": 0.99, "text": "the number of zero flow days was also successful for"}, {"category_id": 15, "poly": [129.0, 522.0, 735.0, 522.0, 735.0, 553.0, 129.0, 553.0], "score": 0.98, "text": "these catchments in indicating a significant increase in"}, {"category_id": 15, "poly": [129.0, 555.0, 730.0, 555.0, 730.0, 585.0, 129.0, 585.0], "score": 0.99, "text": "zero flows. The flow reductions identified here probably"}, {"category_id": 15, "poly": [127.0, 589.0, 732.0, 589.0, 732.0, 619.0, 127.0, 619.0], "score": 0.97, "text": "represent a maximum effect given the size of the"}, {"category_id": 15, "poly": [129.0, 619.0, 728.0, 619.0, 728.0, 649.0, 129.0, 649.0], "score": 0.99, "text": "catchments, level of afforestation and the shallow soils."}, {"category_id": 15, "poly": [129.0, 654.0, 732.0, 654.0, 732.0, 684.0, 129.0, 684.0], "score": 0.98, "text": "These results have yielded useful new insights on the"}, {"category_id": 15, "poly": [129.0, 688.0, 735.0, 688.0, 735.0, 718.0, 129.0, 718.0], "score": 0.99, "text": "contentious issue of the hydrological impact of"}, {"category_id": 15, "poly": [129.0, 720.0, 732.0, 720.0, 732.0, 750.0, 129.0, 750.0], "score": 0.99, "text": "afforestation. 
This research has led to the development"}, {"category_id": 15, "poly": [129.0, 755.0, 732.0, 755.0, 732.0, 785.0, 129.0, 785.0], "score": 0.99, "text": "of a method to assess the net impact of afforestation on"}, {"category_id": 15, "poly": [129.0, 787.0, 730.0, 787.0, 730.0, 817.0, 129.0, 817.0], "score": 0.99, "text": "the fow duration curve which does not require paired-"}, {"category_id": 15, "poly": [127.0, 819.0, 591.0, 821.0, 591.0, 851.0, 127.0, 849.0], "score": 0.98, "text": "catchments to remove climatic variability."}, {"category_id": 15, "poly": [163.0, 1010.0, 730.0, 1010.0, 730.0, 1041.0, 163.0, 1041.0], "score": 0.98, "text": "The authors would like to thank Rory Nathan,"}, {"category_id": 15, "poly": [127.0, 1041.0, 735.0, 1041.0, 735.0, 1077.0, 127.0, 1077.0], "score": 1.0, "text": "Narendra Tuteja, Tom McMahon, Geoff Podger, Rob"}, {"category_id": 15, "poly": [131.0, 1077.0, 728.0, 1077.0, 728.0, 1107.0, 131.0, 1107.0], "score": 0.99, "text": "Vertessy, Glen Walker and Peter Hairsine for particu-"}, {"category_id": 15, "poly": [129.0, 1109.0, 732.0, 1109.0, 732.0, 1140.0, 129.0, 1140.0], "score": 0.99, "text": "larly helpful discussions on methodologies and reviews,"}, {"category_id": 15, "poly": [127.0, 1142.0, 732.0, 1144.0, 732.0, 1174.0, 127.0, 1172.0], "score": 1.0, "text": "Richard Morton for valuable statistical advice, Dave"}, {"category_id": 15, "poly": [129.0, 1178.0, 730.0, 1178.0, 730.0, 1208.0, 129.0, 1208.0], "score": 1.0, "text": "Scott for supplying the South African data, Barry Fahey"}, {"category_id": 15, "poly": [127.0, 1208.0, 732.0, 1210.0, 732.0, 1241.0, 127.0, 1238.0], "score": 0.99, "text": "for the New Zealand data, and Hancocks Victorian"}, {"category_id": 15, "poly": [127.0, 1241.0, 730.0, 1243.0, 730.0, 1273.0, 127.0, 1271.0], "score": 0.99, "text": "Plantations for vegetation data. 
The study was funded by"}, {"category_id": 15, "poly": [129.0, 1277.0, 735.0, 1277.0, 735.0, 1307.0, 129.0, 1307.0], "score": 0.97, "text": "the Victorian Department of Natural Resources and"}, {"category_id": 15, "poly": [129.0, 1312.0, 732.0, 1312.0, 732.0, 1339.0, 129.0, 1339.0], "score": 0.98, "text": "Environment Private Forestry Unit, the CRC for"}, {"category_id": 15, "poly": [129.0, 1344.0, 735.0, 1344.0, 735.0, 1374.0, 129.0, 1374.0], "score": 1.0, "text": "Catchment Hydrology, and the MDBC funded project"}, {"category_id": 15, "poly": [127.0, 1372.0, 735.0, 1374.0, 734.0, 1410.0, 127.0, 1408.0], "score": 0.99, "text": "\u201cIntegrated assessment of the effects of land use changes"}, {"category_id": 15, "poly": [127.0, 1410.0, 558.0, 1408.0, 558.0, 1438.0, 127.0, 1441.0], "score": 0.98, "text": "on water yield and salt loads\u2019 (D2013)."}, {"category_id": 15, "poly": [127.0, 189.0, 170.0, 189.0, 170.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "264"}, {"category_id": 15, "poly": [127.0, 1593.0, 732.0, 1598.0, 732.0, 1632.0, 127.0, 1627.0], "score": 0.97, "text": "Bosch, J.M., 1979. Treatment effects on annual and dry period"}, {"category_id": 15, "poly": [157.0, 1625.0, 732.0, 1623.0, 732.0, 1653.0, 157.0, 1656.0], "score": 0.99, "text": " streamflow at Cathedral Peak. South African Forestry Journal 108,"}, {"category_id": 15, "poly": [161.0, 1651.0, 230.0, 1651.0, 230.0, 1681.0, 161.0, 1681.0], "score": 0.99, "text": "29-37."}, {"category_id": 15, "poly": [127.0, 1681.0, 732.0, 1681.0, 732.0, 1709.0, 127.0, 1709.0], "score": 0.97, "text": "Bosch, J.M., Von Gadow, K., 1990. Regulating afforestation for water"}, {"category_id": 15, "poly": [159.0, 1709.0, 730.0, 1707.0, 730.0, 1737.0, 159.0, 1739.0], "score": 0.98, "text": " conservation in South Africa. 
Suid-Afrikaanse Bosboutydskrif 153,"}, {"category_id": 15, "poly": [163.0, 1739.0, 228.0, 1739.0, 228.0, 1763.0, 163.0, 1763.0], "score": 1.0, "text": "41-54."}, {"category_id": 15, "poly": [127.0, 1763.0, 735.0, 1765.0, 734.0, 1795.0, 127.0, 1793.0], "score": 0.97, "text": "Chiew, F.H.S., McMahon, T.A., 1993. Assessing the adequacy of"}, {"category_id": 15, "poly": [161.0, 1793.0, 735.0, 1793.0, 735.0, 1821.0, 161.0, 1821.0], "score": 0.97, "text": "catchment streamflow yield estimates. Australian Journal of Soil"}, {"category_id": 15, "poly": [163.0, 1819.0, 365.0, 1819.0, 365.0, 1847.0, 163.0, 1847.0], "score": 1.0, "text": "Research 31, 665-680."}, {"category_id": 15, "poly": [791.0, 251.0, 1398.0, 256.0, 1398.0, 286.0, 790.0, 282.0], "score": 0.99, "text": "Cornish, P.M., Vertessy, R.A., 2001. Forest age-induced changes in"}, {"category_id": 15, "poly": [823.0, 284.0, 1398.0, 282.0, 1398.0, 312.0, 823.0, 314.0], "score": 0.98, "text": " evapotranspiration and water yield in a eucalypt forest. Journal of"}, {"category_id": 15, "poly": [825.0, 312.0, 1033.0, 312.0, 1033.0, 342.0, 825.0, 342.0], "score": 1.0, "text": "Hydrology 242, 43-63."}, {"category_id": 15, "poly": [788.0, 338.0, 1398.0, 340.0, 1398.0, 370.0, 788.0, 368.0], "score": 1.0, "text": "Fahey, B., Jackson, R., 1997. Hydrological impacts of converting"}, {"category_id": 15, "poly": [820.0, 366.0, 1398.0, 363.0, 1398.0, 400.0, 821.0, 402.0], "score": 0.97, "text": "native forests and grasslands to pine plantations, South"}, {"category_id": 15, "poly": [821.0, 393.0, 1396.0, 396.0, 1396.0, 428.0, 820.0, 426.0], "score": 0.98, "text": " Island, New Zealand. Agricultural and Forest Meteorology 84,"}, {"category_id": 15, "poly": [825.0, 424.0, 889.0, 424.0, 889.0, 454.0, 825.0, 454.0], "score": 1.0, "text": "69-82."}, {"category_id": 15, "poly": [788.0, 451.0, 1396.0, 454.0, 1396.0, 484.0, 788.0, 482.0], "score": 0.99, "text": "Hickel, K., 2001. 
The effect of pine afforestation on flow regime in"}, {"category_id": 15, "poly": [823.0, 479.0, 1398.0, 484.0, 1398.0, 512.0, 822.0, 507.0], "score": 0.97, "text": "small upland catchments. Masters Thesis, University of Stuttgart,"}, {"category_id": 15, "poly": [820.0, 510.0, 889.0, 505.0, 892.0, 537.0, 822.0, 543.0], "score": 0.94, "text": "p. 134."}, {"category_id": 15, "poly": [790.0, 540.0, 1396.0, 540.0, 1396.0, 568.0, 790.0, 568.0], "score": 1.0, "text": "Holmes, J.W., Sinclair, J.A., 1986. Water yield from some afforested"}, {"category_id": 15, "poly": [825.0, 570.0, 1398.0, 570.0, 1398.0, 598.0, 825.0, 598.0], "score": 0.98, "text": "catchments in Victoria. In Hydrology and Water Resources"}, {"category_id": 15, "poly": [825.0, 596.0, 1398.0, 596.0, 1398.0, 626.0, 825.0, 626.0], "score": 0.99, "text": "Symposium, Griffth University, Brisbane 25-27 November 1986,"}, {"category_id": 15, "poly": [820.0, 626.0, 939.0, 619.0, 941.0, 649.0, 822.0, 656.0], "score": 0.95, "text": "pp. 214-218."}, {"category_id": 15, "poly": [790.0, 654.0, 1398.0, 654.0, 1398.0, 682.0, 790.0, 682.0], "score": 0.99, "text": "Lane, P.N.J., Best, A.E., Hickel, K., Zhang, L., 2003. The effect"}, {"category_id": 15, "poly": [825.0, 682.0, 1398.0, 682.0, 1398.0, 710.0, 825.0, 710.0], "score": 0.99, "text": "of afforestation on flow duration curves. Cooperative Research"}, {"category_id": 15, "poly": [825.0, 710.0, 1396.0, 710.0, 1396.0, 740.0, 825.0, 740.0], "score": 0.97, "text": "Centre for Catchment Hydrology Technical Report O3/13,"}, {"category_id": 15, "poly": [820.0, 745.0, 884.0, 739.0, 886.0, 763.0, 822.0, 768.0], "score": 0.96, "text": "p.25."}, {"category_id": 15, "poly": [790.0, 768.0, 1396.0, 768.0, 1396.0, 798.0, 790.0, 798.0], "score": 0.98, "text": "Legates, D.R., McCabe, G.J., 1999. 
Evaluating the use of 'goodness-"}, {"category_id": 15, "poly": [825.0, 796.0, 1396.0, 796.0, 1396.0, 823.0, 825.0, 823.0], "score": 0.98, "text": "of-fit\u2019 measures in hydrologic and hydroclimatic model validation."}, {"category_id": 15, "poly": [825.0, 823.0, 1181.0, 823.0, 1181.0, 851.0, 825.0, 851.0], "score": 1.0, "text": "Water Resources Research 35, 233-241."}, {"category_id": 15, "poly": [790.0, 851.0, 1398.0, 851.0, 1398.0, 882.0, 790.0, 882.0], "score": 0.98, "text": "Lyne, V.D., Hollick, M., 1979. Stochastic time-varying rainfall-runoff"}, {"category_id": 15, "poly": [825.0, 882.0, 1398.0, 882.0, 1398.0, 912.0, 825.0, 912.0], "score": 1.0, "text": "modelling. Hydrology and Water Resources Symposium, Perth."}, {"category_id": 15, "poly": [825.0, 909.0, 1224.0, 909.0, 1224.0, 940.0, 825.0, 940.0], "score": 0.98, "text": "Institution of Engineers, Australia, pp. 89-92."}, {"category_id": 15, "poly": [788.0, 935.0, 1398.0, 937.0, 1398.0, 968.0, 788.0, 965.0], "score": 0.98, "text": "Nandakumar, N., Mein, R.G., 1993. Analysis of paired catchment data"}, {"category_id": 15, "poly": [825.0, 965.0, 1398.0, 965.0, 1398.0, 995.0, 825.0, 995.0], "score": 1.0, "text": "to determine the hydrologic effects of changes in vegetative cover"}, {"category_id": 15, "poly": [827.0, 995.0, 1396.0, 995.0, 1396.0, 1026.0, 827.0, 1026.0], "score": 0.99, "text": "on yield. Technical Report for Project UM010, Monash University"}, {"category_id": 15, "poly": [788.0, 1049.0, 1398.0, 1051.0, 1398.0, 1081.0, 788.0, 1079.0], "score": 0.98, "text": "Nash, J.E., Sutcliffe, J.V., 1970. River fow forecasting through"}, {"category_id": 15, "poly": [825.0, 1079.0, 1400.0, 1079.0, 1400.0, 1109.0, 825.0, 1109.0], "score": 0.97, "text": "conceptual models, I, A discussion of principals. 
Journal of"}, {"category_id": 15, "poly": [825.0, 1109.0, 1042.0, 1109.0, 1042.0, 1137.0, 825.0, 1137.0], "score": 1.0, "text": "Hydrology 10, 282-290."}, {"category_id": 15, "poly": [790.0, 1137.0, 1398.0, 1137.0, 1398.0, 1165.0, 790.0, 1165.0], "score": 0.98, "text": "Putahena, W.M., Cordery, I., 2000. Some hydrological effects of"}, {"category_id": 15, "poly": [827.0, 1165.0, 1398.0, 1165.0, 1398.0, 1195.0, 827.0, 1195.0], "score": 0.99, "text": "changing forest cover from eucalyptus to Pinus radiata. Agricul-"}, {"category_id": 15, "poly": [825.0, 1193.0, 1192.0, 1193.0, 1192.0, 1223.0, 825.0, 1223.0], "score": 0.99, "text": "tural and Forest Meteorology 100, 59-72."}, {"category_id": 15, "poly": [790.0, 1223.0, 1398.0, 1223.0, 1398.0, 1253.0, 790.0, 1253.0], "score": 0.99, "text": "Roberts, S., Vertessy, R.A., Grayson, R.G., 2001. Transpiration from"}, {"category_id": 15, "poly": [825.0, 1251.0, 1400.0, 1251.0, 1400.0, 1281.0, 825.0, 1281.0], "score": 0.99, "text": "Eucalyptus sieberi (L. Johnson) forests of different age. Forest "}, {"category_id": 15, "poly": [825.0, 1279.0, 1183.0, 1279.0, 1183.0, 1309.0, 825.0, 1309.0], "score": 1.0, "text": "Ecology and Management 143, 153-161."}, {"category_id": 15, "poly": [788.0, 1305.0, 1398.0, 1307.0, 1398.0, 1337.0, 788.0, 1335.0], "score": 0.99, "text": "Scott, D.F., Smith, R.E., 1997. Preliminary empirical models to predict"}, {"category_id": 15, "poly": [823.0, 1333.0, 1398.0, 1335.0, 1398.0, 1365.0, 823.0, 1363.0], "score": 0.98, "text": "reductions in total and low flows resulting from afforestation."}, {"category_id": 15, "poly": [825.0, 1363.0, 1046.0, 1363.0, 1046.0, 1393.0, 825.0, 1393.0], "score": 0.99, "text": "Water S.A. 
23, 135-140."}, {"category_id": 15, "poly": [790.0, 1393.0, 1398.0, 1393.0, 1398.0, 1421.0, 790.0, 1421.0], "score": 0.97, "text": "Scott, D.F., Prinsloo, F.W., Moses, G., Mehlomakulu, M.,"}, {"category_id": 15, "poly": [825.0, 1421.0, 1398.0, 1421.0, 1398.0, 1449.0, 825.0, 1449.0], "score": 0.97, "text": "Simmers, A.D.A., 2000. Area-analysis of the South African"}, {"category_id": 15, "poly": [825.0, 1449.0, 1398.0, 1449.0, 1398.0, 1479.0, 825.0, 1479.0], "score": 0.96, "text": "catchment afforestation experimental data. WRC Report"}, {"category_id": 15, "poly": [825.0, 1481.0, 954.0, 1481.0, 954.0, 1505.0, 825.0, 1505.0], "score": 0.98, "text": "No. 810/1/00."}, {"category_id": 15, "poly": [790.0, 1507.0, 1396.0, 1507.0, 1396.0, 1535.0, 790.0, 1535.0], "score": 0.98, "text": "Sikka, A.K., Samra, JS., Sharda, V.N., Samraj, P., Lakshmanan, V.,"}, {"category_id": 15, "poly": [825.0, 1535.0, 1400.0, 1535.0, 1400.0, 1565.0, 825.0, 1565.0], "score": 0.98, "text": "2003. Low fow and high responses to converting natural grassland"}, {"category_id": 15, "poly": [827.0, 1561.0, 1400.0, 1561.0, 1400.0, 1591.0, 827.0, 1591.0], "score": 0.99, "text": "into bluegum (Eucalyptus globulus) in Ningiris watersheds of"}, {"category_id": 15, "poly": [825.0, 1591.0, 1235.0, 1591.0, 1235.0, 1621.0, 825.0, 1621.0], "score": 0.99, "text": "South India. Journal of Hydrology 270, 12-26."}, {"category_id": 15, "poly": [790.0, 1621.0, 1398.0, 1621.0, 1398.0, 1651.0, 790.0, 1651.0], "score": 0.98, "text": " Smakhtin, V.U., 1999. A concept of pragmatic hydrological time series "}, {"category_id": 15, "poly": [825.0, 1649.0, 1398.0, 1649.0, 1398.0, 1679.0, 825.0, 1679.0], "score": 0.99, "text": "modelling and its application in South African context. 
In Ninth"}, {"category_id": 15, "poly": [823.0, 1675.0, 1398.0, 1677.0, 1398.0, 1707.0, 823.0, 1705.0], "score": 0.98, "text": " South African National Hydrology Symposium, 29-30 November"}, {"category_id": 15, "poly": [825.0, 1703.0, 971.0, 1703.0, 971.0, 1739.0, 825.0, 1739.0], "score": 0.99, "text": "1999, pp. 1-11."}, {"category_id": 15, "poly": [790.0, 1735.0, 1398.0, 1735.0, 1398.0, 1765.0, 790.0, 1765.0], "score": 0.98, "text": " Smakhtin, V.U., 2001. Low flow hydrology: a review. Journal of"}, {"category_id": 15, "poly": [825.0, 1763.0, 1052.0, 1763.0, 1052.0, 1793.0, 825.0, 1793.0], "score": 1.0, "text": "Hydrology 240, 147-186."}, {"category_id": 15, "poly": [793.0, 1791.0, 1398.0, 1791.0, 1398.0, 1821.0, 793.0, 1821.0], "score": 0.99, "text": "Van Lill, W.S., Kruger, F.J., Van Wyk, D.B., 1980. The effect of"}, {"category_id": 15, "poly": [827.0, 1819.0, 1398.0, 1819.0, 1398.0, 1849.0, 827.0, 1849.0], "score": 0.98, "text": "afforestation with Eucalyptus grandis Hill ex Maiden and Pinus"}, {"category_id": 15, "poly": [823.0, 1021.0, 1066.0, 1023.0, 1066.0, 1054.0, 822.0, 1051.0], "score": 0.98, "text": " Dept. 
of Civil Engineering,"}, {"category_id": 15, "poly": [126.0, 1528.0, 260.0, 1533.0, 259.0, 1572.0, 124.0, 1567.0], "score": 1.0, "text": "References"}], "page_info": {"page_no": 11, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [465.48040771484375, 195.6739959716797, 1032.2401123046875, 195.6739959716797, 1032.2401123046875, 218.9838104248047, 465.48040771484375, 218.9838104248047], "score": 0.9999986886978149}, {"category_id": 1, "poly": [776.9209594726562, 255.59912109375, 1385.6478271484375, 255.59912109375, 1385.6478271484375, 614.4959716796875, 776.9209594726562, 614.4959716796875], "score": 0.9999933242797852}, {"category_id": 2, "poly": [1346.0157470703125, 195.03271484375, 1382.0159912109375, 195.03271484375, 1382.0159912109375, 217.2877960205078, 1346.0157470703125, 217.2877960205078], "score": 0.9999925494194031}, {"category_id": 1, "poly": [116.54571533203125, 257.5740966796875, 716.8768920898438, 257.5740966796875, 716.8768920898438, 615.0397338867188, 116.54571533203125, 615.0397338867188], "score": 0.9999920725822449}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [780.0, 254.0, 1383.0, 254.0, 1383.0, 284.0, 780.0, 284.0], "score": 0.99, "text": "Vogel, R.M., Fennessey, N.M., 1994. Flow duration curves. 1. New"}, {"category_id": 15, "poly": [810.0, 279.0, 1385.0, 282.0, 1385.0, 312.0, 810.0, 310.0], "score": 0.98, "text": " interpretation and confidence intervals. Journal of Water Planning"}, {"category_id": 15, "poly": [814.0, 310.0, 1128.0, 310.0, 1128.0, 340.0, 814.0, 340.0], "score": 0.99, "text": "and Management 120 (4), 485-504."}, {"category_id": 15, "poly": [780.0, 338.0, 1387.0, 338.0, 1387.0, 366.0, 780.0, 366.0], "score": 0.98, "text": "Whitehead, D., Beadle C.L., 2004. 
Physiological regulation of"}, {"category_id": 15, "poly": [808.0, 361.0, 1387.0, 363.0, 1387.0, 400.0, 807.0, 398.0], "score": 0.98, "text": " productivity and water use in Eucalyptus: a review. Forest Ecology"}, {"category_id": 15, "poly": [812.0, 393.0, 1104.0, 393.0, 1104.0, 424.0, 812.0, 424.0], "score": 1.0, "text": "and Management, 193, 113-140."}, {"category_id": 15, "poly": [777.0, 421.0, 1385.0, 421.0, 1385.0, 449.0, 777.0, 449.0], "score": 0.98, "text": "Zhang, L., Dawes, W.R., Walker, G.R., 1999. Predicting the effect of"}, {"category_id": 15, "poly": [814.0, 449.0, 1383.0, 449.0, 1383.0, 479.0, 814.0, 479.0], "score": 0.99, "text": "vegetation changes on catchment average water balance. Coop-"}, {"category_id": 15, "poly": [812.0, 475.0, 1385.0, 475.0, 1385.0, 505.0, 812.0, 505.0], "score": 0.99, "text": "erative Research Centre for Catchment Hydrology Technical"}, {"category_id": 15, "poly": [810.0, 503.0, 994.0, 503.0, 994.0, 533.0, 810.0, 533.0], "score": 0.99, "text": "Report 99/12, p. 35."}, {"category_id": 15, "poly": [777.0, 531.0, 1385.0, 531.0, 1385.0, 561.0, 777.0, 561.0], "score": 0.97, "text": "Zhang, L., Dawes, W.R., Walker, G.R., 2001. Response of mean"}, {"category_id": 15, "poly": [810.0, 557.0, 1385.0, 559.0, 1385.0, 589.0, 810.0, 587.0], "score": 0.98, "text": " annual evapotranspiration to vegetation changes at catchment"}, {"category_id": 15, "poly": [812.0, 587.0, 1222.0, 587.0, 1222.0, 615.0, 812.0, 615.0], "score": 1.0, "text": "scale. Water Resources Research 37, 701-708."}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "265"}, {"category_id": 15, "poly": [148.0, 254.0, 719.0, 254.0, 719.0, 284.0, 148.0, 284.0], "score": 1.0, "text": "patula Schlect. et Cham. on streamflow from experimental"}, {"category_id": 15, "poly": [146.0, 279.0, 720.0, 282.0, 719.0, 312.0, 146.0, 310.0], "score": 0.99, "text": "catchments at Mokubulaan, Transval. 
Journal of Hydrology 48,"}, {"category_id": 15, "poly": [150.0, 312.0, 234.0, 312.0, 234.0, 335.0, 150.0, 335.0], "score": 1.0, "text": "107-118."}, {"category_id": 15, "poly": [114.0, 338.0, 719.0, 338.0, 719.0, 366.0, 114.0, 366.0], "score": 0.97, "text": "Van Wyk, D.B., 1987. Some effects of afforestation on streamflow"}, {"category_id": 15, "poly": [144.0, 366.0, 719.0, 366.0, 719.0, 396.0, 144.0, 396.0], "score": 0.98, "text": "in the Western Cape Province, South Africa. Water S.A. 13,"}, {"category_id": 15, "poly": [148.0, 396.0, 210.0, 396.0, 210.0, 419.0, 148.0, 419.0], "score": 1.0, "text": "31-36."}, {"category_id": 15, "poly": [114.0, 421.0, 719.0, 421.0, 719.0, 452.0, 114.0, 452.0], "score": 0.98, "text": "Vertessy, R.A., Bessard, Y., 1999. Anticipating the negative"}, {"category_id": 15, "poly": [146.0, 449.0, 722.0, 449.0, 722.0, 479.0, 146.0, 479.0], "score": 0.98, "text": "hydrologic effects of plantation expansion: results from a"}, {"category_id": 15, "poly": [148.0, 475.0, 717.0, 475.0, 717.0, 503.0, 148.0, 503.0], "score": 0.98, "text": "GIS-based analysis on the Murrumbidgee Basin, in: Croke, J.,"}, {"category_id": 15, "poly": [146.0, 503.0, 722.0, 503.0, 722.0, 533.0, 146.0, 533.0], "score": 0.99, "text": "Lane, P.N.J. (Eds.), Forest Management for Water Quality and"}, {"category_id": 15, "poly": [144.0, 527.0, 722.0, 529.0, 722.0, 565.0, 144.0, 563.0], "score": 0.99, "text": "Quantity: Proceedings of the 2nd Erosion in Forests Meeting"}, {"category_id": 15, "poly": [146.0, 557.0, 722.0, 559.0, 722.0, 589.0, 146.0, 587.0], "score": 0.97, "text": " Cooperative Research Centre for Catchment Hydrology, Report "}, {"category_id": 15, "poly": [146.0, 587.0, 301.0, 587.0, 301.0, 617.0, 146.0, 617.0], "score": 0.93, "text": "99/6, Pp. 
69-73."}], "page_info": {"page_no": 12, "height": 2064, "width": 1512}}] \ No newline at end of file diff --git a/demo/demo1.pdf b/demo/demo1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c9405d621e47da5bef2e4c685e8713095bbd4237 Binary files /dev/null and b/demo/demo1.pdf differ diff --git a/demo/demo2.json b/demo/demo2.json new file mode 100644 index 0000000000000000000000000000000000000000..d632d70fafe0b56404f1508f66c7835988ed6ff3 --- /dev/null +++ b/demo/demo2.json @@ -0,0 +1 @@ +[{"layout_dets": [{"category_id": 0, "poly": [282.1632080078125, 156.2249755859375, 1416.6795654296875, 156.2249755859375, 1416.6795654296875, 313.81280517578125, 282.1632080078125, 313.81280517578125], "score": 0.999998927116394}, {"category_id": 1, "poly": [861.656982421875, 522.7763061523438, 1569.3853759765625, 522.7763061523438, 1569.3853759765625, 656.883544921875, 861.656982421875, 656.883544921875], "score": 0.9999970197677612}, {"category_id": 1, "poly": [131.8020782470703, 924.7362670898438, 838.9530639648438, 924.7362670898438, 838.9530639648438, 1323.7529296875, 131.8020782470703, 1323.7529296875], "score": 0.9999949932098389}, {"category_id": 1, "poly": [133.32005310058594, 1324.5035400390625, 839.2289428710938, 1324.5035400390625, 839.2289428710938, 1589.4503173828125, 133.32005310058594, 1589.4503173828125], "score": 0.999994158744812}, {"category_id": 1, "poly": [863.3811645507812, 1486.610107421875, 1569.2880859375, 1486.610107421875, 1569.2880859375, 1852.443603515625, 863.3811645507812, 1852.443603515625], "score": 0.9999936819076538}, {"category_id": 1, "poly": [862.9096069335938, 1187.8067626953125, 1568.2279052734375, 1187.8067626953125, 1568.2279052734375, 1486.08935546875, 862.9096069335938, 1486.08935546875], "score": 0.9999932050704956}, {"category_id": 1, "poly": [131.8186492919922, 1652.7752685546875, 837.5543823242188, 1652.7752685546875, 837.5543823242188, 2019.429443359375, 131.8186492919922, 2019.429443359375], "score": 
0.9999901056289673}, {"category_id": 0, "poly": [375.1526794433594, 881.8807983398438, 594.3075561523438, 881.8807983398438, 594.3075561523438, 913.4786987304688, 375.1526794433594, 913.4786987304688], "score": 0.9999892115592957}, {"category_id": 2, "poly": [636.1867065429688, 2099.795654296875, 1063.7423095703125, 2099.795654296875, 1063.7423095703125, 2124.524169921875, 636.1867065429688, 2124.524169921875], "score": 0.9999860525131226}, {"category_id": 0, "poly": [375.91864013671875, 1610.209228515625, 592.8395385742188, 1610.209228515625, 592.8395385742188, 1641.5789794921875, 375.91864013671875, 1641.5789794921875], "score": 0.9999815821647644}, {"category_id": 4, "poly": [860.6583251953125, 995.6574096679688, 1569.622314453125, 995.6574096679688, 1569.622314453125, 1126.8409423828125, 860.6583251953125, 1126.8409423828125], "score": 0.9999815821647644}, {"category_id": 1, "poly": [443.1008605957031, 353.8008728027344, 1250.531494140625, 353.8008728027344, 1250.531494140625, 464.65576171875, 443.1008605957031, 464.65576171875], "score": 0.9999791979789734}, {"category_id": 1, "poly": [130.8282928466797, 523.2079467773438, 836.5639038085938, 523.2079467773438, 836.5639038085938, 862.0206909179688, 130.8282928466797, 862.0206909179688], "score": 0.9999784231185913}, {"category_id": 1, "poly": [862.6514282226562, 1851.426513671875, 1568.510498046875, 1851.426513671875, 1568.510498046875, 2017.93359375, 862.6514282226562, 2017.93359375], "score": 0.9999769926071167}, {"category_id": 3, "poly": [882.3795166015625, 685.376708984375, 1544.4088134765625, 685.376708984375, 1544.4088134765625, 969.22265625, 882.3795166015625, 969.22265625], "score": 0.9994785785675049}, {"category_id": 13, "poly": [1195, 1062, 1226, 1062, 1226, 1096, 1195, 1096], "score": 0.88, "latex": "d_{p}"}, {"category_id": 13, "poly": [1304, 1030, 1327, 1030, 1327, 1061, 1304, 1061], "score": 0.65, "latex": "\\bar{\\bf p}"}, {"category_id": 15, "poly": [344.0, 165.0, 1354.0, 172.0, 1353.0, 236.0, 
344.0, 229.0], "score": 0.99, "text": "Real-time Temporal Stereo Matching"}, {"category_id": 15, "poly": [293.0, 254.0, 1402.0, 254.0, 1402.0, 309.0, 293.0, 309.0], "score": 0.99, "text": "using Iterative Adaptive Support Weights"}, {"category_id": 15, "poly": [864.0, 527.0, 1568.0, 527.0, 1568.0, 559.0, 864.0, 559.0], "score": 0.99, "text": "disparity map. Note that individual disparities can be converted"}, {"category_id": 15, "poly": [864.0, 561.0, 1568.0, 561.0, 1568.0, 594.0, 864.0, 594.0], "score": 0.98, "text": "to actual depths if the geometry of the camera setup is"}, {"category_id": 15, "poly": [859.0, 587.0, 1568.0, 591.0, 1568.0, 630.0, 859.0, 626.0], "score": 0.98, "text": " known, i.e., the stereo configuration of cameras has been pre-"}, {"category_id": 15, "poly": [862.0, 626.0, 984.0, 626.0, 984.0, 658.0, 862.0, 658.0], "score": 1.0, "text": "calibrated."}, {"category_id": 15, "poly": [155.0, 921.0, 839.0, 924.0, 838.0, 963.0, 155.0, 960.0], "score": 0.98, "text": " Modern stereo matching algorithms achieve excellent results"}, {"category_id": 15, "poly": [127.0, 956.0, 838.0, 958.0, 838.0, 997.0, 127.0, 995.0], "score": 0.98, "text": " on static stereo images, as demonstrated by the Middlebury"}, {"category_id": 15, "poly": [132.0, 995.0, 836.0, 995.0, 836.0, 1027.0, 132.0, 1027.0], "score": 0.98, "text": "stereo performance benchmark [1], [2]. However, their ap-"}, {"category_id": 15, "poly": [134.0, 1027.0, 834.0, 1027.0, 834.0, 1059.0, 134.0, 1059.0], "score": 1.0, "text": "plication to stereo video sequences does not guarantee inter-"}, {"category_id": 15, "poly": [134.0, 1061.0, 836.0, 1061.0, 836.0, 1093.0, 134.0, 1093.0], "score": 0.99, "text": "frame consistency of matches extracted from subsequent stereo"}, {"category_id": 15, "poly": [132.0, 1095.0, 838.0, 1095.0, 838.0, 1125.0, 132.0, 1125.0], "score": 0.99, "text": "frame pairs. 
The lack of temporal consistency of matches"}, {"category_id": 15, "poly": [134.0, 1128.0, 836.0, 1128.0, 836.0, 1157.0, 134.0, 1157.0], "score": 1.0, "text": "between successive frames introduces spurious artifacts in the"}, {"category_id": 15, "poly": [132.0, 1160.0, 836.0, 1160.0, 836.0, 1192.0, 132.0, 1192.0], "score": 0.99, "text": "resulting disparity maps. The problem of obtaining temporally"}, {"category_id": 15, "poly": [132.0, 1194.0, 838.0, 1194.0, 838.0, 1226.0, 132.0, 1226.0], "score": 0.98, "text": "consistent sequences of disparity maps from video streams is"}, {"category_id": 15, "poly": [134.0, 1228.0, 838.0, 1228.0, 838.0, 1260.0, 134.0, 1260.0], "score": 0.98, "text": "known as the temporal stereo correspondence problem, yet"}, {"category_id": 15, "poly": [129.0, 1258.0, 841.0, 1260.0, 841.0, 1293.0, 129.0, 1290.0], "score": 0.98, "text": "the amount of research efforts oriented towards finding an"}, {"category_id": 15, "poly": [134.0, 1292.0, 760.0, 1292.0, 760.0, 1325.0, 134.0, 1325.0], "score": 0.99, "text": "effective solution to this problem is surprisingly small."}, {"category_id": 15, "poly": [157.0, 1320.0, 836.0, 1322.0, 836.0, 1361.0, 157.0, 1359.0], "score": 0.98, "text": " A method is proposed for real-time temporal stereo match-"}, {"category_id": 15, "poly": [134.0, 1361.0, 836.0, 1361.0, 836.0, 1393.0, 134.0, 1393.0], "score": 1.0, "text": "ing that efficiently propagates matching cost information be-"}, {"category_id": 15, "poly": [134.0, 1393.0, 836.0, 1393.0, 836.0, 1425.0, 134.0, 1425.0], "score": 0.99, "text": "tween consecutive frames of a stereo video sequence. 
This"}, {"category_id": 15, "poly": [132.0, 1423.0, 834.0, 1425.0, 834.0, 1458.0, 132.0, 1455.0], "score": 0.98, "text": "method is invariant to the number of prior frames being"}, {"category_id": 15, "poly": [134.0, 1458.0, 836.0, 1458.0, 836.0, 1490.0, 134.0, 1490.0], "score": 0.99, "text": "considered, and can be easily incorporated into any local stereo"}, {"category_id": 15, "poly": [132.0, 1492.0, 836.0, 1492.0, 836.0, 1524.0, 132.0, 1524.0], "score": 0.98, "text": "method based on edge-aware filters. The iterative adaptive"}, {"category_id": 15, "poly": [132.0, 1526.0, 838.0, 1526.0, 838.0, 1558.0, 132.0, 1558.0], "score": 0.99, "text": "support matching algorithm presented in [3] serves as a"}, {"category_id": 15, "poly": [132.0, 1558.0, 557.0, 1558.0, 557.0, 1590.0, 132.0, 1590.0], "score": 0.99, "text": "foundation for the proposed method."}, {"category_id": 15, "poly": [887.0, 1483.0, 1571.0, 1485.0, 1571.0, 1524.0, 887.0, 1522.0], "score": 0.98, "text": " In contrast, local methods, which are typically built upon"}, {"category_id": 15, "poly": [859.0, 1517.0, 1573.0, 1519.0, 1573.0, 1558.0, 859.0, 1556.0], "score": 0.97, "text": " the Winner-Takes-All (WTA) framework, have the property of "}, {"category_id": 15, "poly": [864.0, 1556.0, 1566.0, 1556.0, 1566.0, 1588.0, 864.0, 1588.0], "score": 0.99, "text": "computational regularity and are thus suitable for implemen-"}, {"category_id": 15, "poly": [862.0, 1588.0, 1566.0, 1588.0, 1566.0, 1620.0, 862.0, 1620.0], "score": 1.0, "text": "tation on parallel graphics hardware. 
Within the WTA frame-"}, {"category_id": 15, "poly": [862.0, 1616.0, 1568.0, 1618.0, 1568.0, 1657.0, 862.0, 1655.0], "score": 0.98, "text": "work, local stereo algorithms consider a range of disparity"}, {"category_id": 15, "poly": [864.0, 1655.0, 1566.0, 1655.0, 1566.0, 1687.0, 864.0, 1687.0], "score": 0.98, "text": "hypotheses and compute a volume of pixel-wise dissimilarity"}, {"category_id": 15, "poly": [862.0, 1689.0, 1571.0, 1689.0, 1571.0, 1721.0, 862.0, 1721.0], "score": 0.99, "text": "metrics between the reference image and the matched image at"}, {"category_id": 15, "poly": [862.0, 1723.0, 1568.0, 1721.0, 1568.0, 1753.0, 862.0, 1755.0], "score": 0.99, "text": "every considered disparity value. Final disparities are chosen"}, {"category_id": 15, "poly": [864.0, 1755.0, 1568.0, 1755.0, 1568.0, 1785.0, 864.0, 1785.0], "score": 1.0, "text": "from the cost volume by traversing through its values and"}, {"category_id": 15, "poly": [866.0, 1788.0, 1568.0, 1788.0, 1568.0, 1820.0, 866.0, 1820.0], "score": 0.99, "text": "selecting the disparities associated with minimum matching"}, {"category_id": 15, "poly": [859.0, 1817.0, 1377.0, 1820.0, 1377.0, 1859.0, 859.0, 1856.0], "score": 0.98, "text": " costs for every pixel of the reference image."}, {"category_id": 15, "poly": [885.0, 1187.0, 1571.0, 1187.0, 1571.0, 1226.0, 885.0, 1226.0], "score": 0.97, "text": " In their excellent taxonomy paper [1], Scharstein and"}, {"category_id": 15, "poly": [864.0, 1224.0, 1566.0, 1224.0, 1566.0, 1254.0, 864.0, 1254.0], "score": 0.99, "text": "Szeliski classify stereo algorithms as local or global meth-"}, {"category_id": 15, "poly": [859.0, 1249.0, 1571.0, 1254.0, 1570.0, 1293.0, 859.0, 1288.0], "score": 0.99, "text": " ods. 
Global methods, which offer outstanding accuracy, are"}, {"category_id": 15, "poly": [862.0, 1288.0, 1571.0, 1288.0, 1571.0, 1327.0, 862.0, 1327.0], "score": 0.98, "text": "typically derived from an energy minimization framework"}, {"category_id": 15, "poly": [859.0, 1322.0, 1566.0, 1322.0, 1566.0, 1352.0, 859.0, 1352.0], "score": 0.99, "text": "that allows for explicit integration of disparity smoothness"}, {"category_id": 15, "poly": [864.0, 1357.0, 1568.0, 1357.0, 1568.0, 1389.0, 864.0, 1389.0], "score": 0.99, "text": "constraints and thus is capable of regularizing the solution"}, {"category_id": 15, "poly": [864.0, 1391.0, 1568.0, 1391.0, 1568.0, 1421.0, 864.0, 1421.0], "score": 1.0, "text": "in weakly textured areas. The minimization, however, is often"}, {"category_id": 15, "poly": [864.0, 1423.0, 1568.0, 1423.0, 1568.0, 1455.0, 864.0, 1455.0], "score": 0.99, "text": "achieved using iterative methods or graph cuts, which do not"}, {"category_id": 15, "poly": [864.0, 1458.0, 1418.0, 1458.0, 1418.0, 1487.0, 864.0, 1487.0], "score": 0.99, "text": "lend themselves well to parallel implementation."}, {"category_id": 15, "poly": [155.0, 1650.0, 839.0, 1652.0, 838.0, 1691.0, 155.0, 1689.0], "score": 0.97, "text": " Stereo matching is the process of identifying correspon-"}, {"category_id": 15, "poly": [134.0, 1687.0, 838.0, 1687.0, 838.0, 1719.0, 134.0, 1719.0], "score": 0.99, "text": "dences between pixels in stereo images obtained using a"}, {"category_id": 15, "poly": [132.0, 1723.0, 838.0, 1721.0, 838.0, 1753.0, 132.0, 1755.0], "score": 0.98, "text": "pair of synchronized cameras. These correspondences are"}, {"category_id": 15, "poly": [134.0, 1755.0, 836.0, 1755.0, 836.0, 1788.0, 134.0, 1788.0], "score": 0.99, "text": "conveniently represented using the notion of disparity, i.e. the"}, {"category_id": 15, "poly": [134.0, 1788.0, 836.0, 1788.0, 836.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "positional offset between two matching pixels. 
It is assumed"}, {"category_id": 15, "poly": [134.0, 1822.0, 836.0, 1822.0, 836.0, 1854.0, 134.0, 1854.0], "score": 0.99, "text": "that the stereo images are rectified, such that matching pixels"}, {"category_id": 15, "poly": [132.0, 1854.0, 836.0, 1854.0, 836.0, 1886.0, 132.0, 1886.0], "score": 0.99, "text": "are confined within corresponding rows of the images and"}, {"category_id": 15, "poly": [134.0, 1888.0, 838.0, 1888.0, 838.0, 1918.0, 134.0, 1918.0], "score": 1.0, "text": "thus disparities are restricted to the horizontal dimension, as"}, {"category_id": 15, "poly": [134.0, 1920.0, 838.0, 1920.0, 838.0, 1952.0, 134.0, 1952.0], "score": 1.0, "text": "illustrated in Figure 1. For visualization purposes, disparities"}, {"category_id": 15, "poly": [134.0, 1955.0, 838.0, 1955.0, 838.0, 1987.0, 134.0, 1987.0], "score": 0.99, "text": "recovered for every pixel of a reference image are stored"}, {"category_id": 15, "poly": [129.0, 1985.0, 841.0, 1982.0, 841.0, 2021.0, 129.0, 2024.0], "score": 0.98, "text": "together in the form of an image, which is known as the"}, {"category_id": 15, "poly": [370.0, 885.0, 594.0, 885.0, 594.0, 917.0, 370.0, 917.0], "score": 1.0, "text": "1. INTRODUCTION"}, {"category_id": 15, "poly": [638.0, 2099.0, 1062.0, 2099.0, 1062.0, 2131.0, 638.0, 2131.0], "score": 0.98, "text": "978-1-4673-5208-6/13/$31.00 @2013 IEEE"}, {"category_id": 15, "poly": [374.0, 1613.0, 591.0, 1613.0, 591.0, 1645.0, 374.0, 1645.0], "score": 0.95, "text": "II. 
BACKGROUND"}, {"category_id": 15, "poly": [859.0, 992.0, 1571.0, 995.0, 1571.0, 1034.0, 859.0, 1031.0], "score": 0.99, "text": " Figure 1: Geometry of two horizontally aligned views where p"}, {"category_id": 15, "poly": [864.0, 1098.0, 1291.0, 1098.0, 1291.0, 1130.0, 864.0, 1130.0], "score": 0.99, "text": "them along the horizontal dimension."}, {"category_id": 15, "poly": [859.0, 1061.0, 1194.0, 1059.0, 1194.0, 1098.0, 859.0, 1100.0], "score": 0.98, "text": " pixel in the target frame, and"}, {"category_id": 15, "poly": [1227.0, 1061.0, 1571.0, 1059.0, 1571.0, 1098.0, 1227.0, 1100.0], "score": 0.97, "text": " denotes the disparity between"}, {"category_id": 15, "poly": [864.0, 1034.0, 1303.0, 1034.0, 1303.0, 1063.0, 864.0, 1063.0], "score": 0.99, "text": "denotes a pixel in the reference frame,"}, {"category_id": 15, "poly": [1328.0, 1034.0, 1566.0, 1034.0, 1566.0, 1063.0, 1328.0, 1063.0], "score": 0.96, "text": " denotes its matching"}, {"category_id": 15, "poly": [508.0, 357.0, 1194.0, 360.0, 1194.0, 392.0, 508.0, 390.0], "score": 0.98, "text": "Jedrzej Kowalczuk, Eric T. Psota, and Lance C. P\u00e9rez"}, {"category_id": 15, "poly": [443.0, 392.0, 1245.0, 392.0, 1245.0, 424.0, 443.0, 424.0], "score": 0.99, "text": "Department of Electrical Engineering, University of Nebraska-Lincoln"}, {"category_id": 15, "poly": [614.0, 435.0, 1081.0, 435.0, 1081.0, 465.0, 614.0, 465.0], "score": 0.99, "text": "[jkowalczuk2,epsota,lperez] @unl.edu"}, {"category_id": 15, "poly": [159.0, 527.0, 836.0, 527.0, 836.0, 559.0, 159.0, 559.0], "score": 0.98, "text": "Abstract-Stereo matching algorithms are nearly always de-"}, {"category_id": 15, "poly": [132.0, 555.0, 838.0, 555.0, 838.0, 587.0, 132.0, 587.0], "score": 0.98, "text": "signed to find matches between a single pair of images. 
A method"}, {"category_id": 15, "poly": [134.0, 580.0, 836.0, 580.0, 836.0, 612.0, 134.0, 612.0], "score": 1.0, "text": "is presented that was specifically designed to operate on sequences"}, {"category_id": 15, "poly": [132.0, 605.0, 838.0, 607.0, 838.0, 646.0, 132.0, 644.0], "score": 0.99, "text": "of images. This method considers the cost of matching image"}, {"category_id": 15, "poly": [132.0, 637.0, 838.0, 637.0, 838.0, 669.0, 132.0, 669.0], "score": 0.98, "text": "points in both the spatial and temporal domain. To maintain"}, {"category_id": 15, "poly": [134.0, 667.0, 838.0, 667.0, 838.0, 699.0, 134.0, 699.0], "score": 0.97, "text": "real-time operation, a temporal cost aggregation method is used"}, {"category_id": 15, "poly": [132.0, 692.0, 836.0, 692.0, 836.0, 722.0, 132.0, 722.0], "score": 0.98, "text": "to evaluate the likelihood of matches that is invariant with respect"}, {"category_id": 15, "poly": [127.0, 717.0, 841.0, 715.0, 841.0, 754.0, 127.0, 756.0], "score": 0.97, "text": "to the number of prior images being considered. 
This method"}, {"category_id": 15, "poly": [127.0, 742.0, 841.0, 745.0, 841.0, 784.0, 127.0, 781.0], "score": 0.98, "text": "has been implemented on massively parallel GPU hardware,"}, {"category_id": 15, "poly": [132.0, 777.0, 838.0, 777.0, 838.0, 809.0, 132.0, 809.0], "score": 0.99, "text": "and the implementation ranks as one of the fastest and most"}, {"category_id": 15, "poly": [132.0, 802.0, 838.0, 804.0, 838.0, 836.0, 132.0, 834.0], "score": 0.99, "text": "accurate real-time stereo matching methods as measured by the"}, {"category_id": 15, "poly": [134.0, 830.0, 619.0, 830.0, 619.0, 862.0, 134.0, 862.0], "score": 0.99, "text": "Middlebury stereo performance benchmark."}, {"category_id": 15, "poly": [887.0, 1849.0, 1568.0, 1852.0, 1568.0, 1891.0, 887.0, 1888.0], "score": 0.99, "text": " Disparity maps obtained using this simple strategy are often"}, {"category_id": 15, "poly": [862.0, 1888.0, 1568.0, 1888.0, 1568.0, 1920.0, 862.0, 1920.0], "score": 0.98, "text": "too noisy to be considered useable. 
To reduce the effects"}, {"category_id": 15, "poly": [864.0, 1923.0, 1568.0, 1923.0, 1568.0, 1952.0, 864.0, 1952.0], "score": 0.99, "text": "of noise and enforce spatial consistency of matches, local"}, {"category_id": 15, "poly": [862.0, 1948.0, 1568.0, 1950.0, 1568.0, 1989.0, 861.0, 1987.0], "score": 0.99, "text": "stereo algorithms consider arbitrarily shaped and sized support"}, {"category_id": 15, "poly": [864.0, 1989.0, 1568.0, 1989.0, 1568.0, 2021.0, 864.0, 2021.0], "score": 0.99, "text": "windows centered at each pixel of the reference image, and"}], "page_info": {"page_no": 0, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 8, "poly": [962.3624267578125, 1513.2073974609375, 1465.4017333984375, 1513.2073974609375, 1465.4017333984375, 1669.1397705078125, 962.3624267578125, 1669.1397705078125], "score": 0.9999995231628418}, {"category_id": 9, "poly": [1530.72998046875, 1101.879638671875, 1565.2568359375, 1101.879638671875, 1565.2568359375, 1130.8609619140625, 1530.72998046875, 1130.8609619140625], "score": 0.9999992251396179}, {"category_id": 9, "poly": [1529.8787841796875, 1575.843505859375, 1565.931396484375, 1575.843505859375, 1565.931396484375, 1607.2161865234375, 1529.8787841796875, 1607.2161865234375], "score": 0.9999987483024597}, {"category_id": 1, "poly": [865.1971435546875, 1684.040283203125, 1566.561279296875, 1684.040283203125, 1566.561279296875, 1813.7021484375, 865.1971435546875, 1813.7021484375], "score": 0.9999987483024597}, {"category_id": 9, "poly": [1530.5263671875, 1839.3990478515625, 1565.1201171875, 1839.3990478515625, 1565.1201171875, 1869.825439453125, 1530.5263671875, 1869.825439453125], "score": 0.9999977946281433}, {"category_id": 8, "poly": [972.3255004882812, 1075.85498046875, 1461.2088623046875, 1075.85498046875, 1461.2088623046875, 1155.465087890625, 972.3255004882812, 1155.465087890625], "score": 0.999996542930603}, {"category_id": 1, "poly": [865.4874267578125, 158.47100830078125, 1565.84375, 
158.47100830078125, 1565.84375, 355.3230285644531, 865.4874267578125, 355.3230285644531], "score": 0.9999960660934448}, {"category_id": 1, "poly": [133.51382446289062, 158.21670532226562, 835.5382080078125, 158.21670532226562, 835.5382080078125, 558.8020629882812, 133.51382446289062, 558.8020629882812], "score": 0.9999951124191284}, {"category_id": 1, "poly": [134.01239013671875, 954.4151000976562, 836.1470336914062, 954.4151000976562, 836.1470336914062, 1618.77197265625, 134.01239013671875, 1618.77197265625], "score": 0.9999947547912598}, {"category_id": 1, "poly": [134.4542999267578, 558.8201904296875, 834.2548828125, 558.8201904296875, 834.2548828125, 954.7811279296875, 134.4542999267578, 954.7811279296875], "score": 0.9999943971633911}, {"category_id": 1, "poly": [866.33642578125, 421.84442138671875, 1566.451904296875, 421.84442138671875, 1566.451904296875, 787.1864624023438, 866.33642578125, 787.1864624023438], "score": 0.9999930262565613}, {"category_id": 1, "poly": [864.974853515625, 1167.92236328125, 1567.0927734375, 1167.92236328125, 1567.0927734375, 1298.29541015625, 864.974853515625, 1298.29541015625], "score": 0.9999929666519165}, {"category_id": 1, "poly": [864.5220947265625, 853.943359375, 1565.82080078125, 853.943359375, 1565.82080078125, 1080.8125, 864.5220947265625, 1080.8125], "score": 0.9999923706054688}, {"category_id": 1, "poly": [865.4466552734375, 1919.30615234375, 1566.4720458984375, 1919.30615234375, 1566.4720458984375, 2017.154541015625, 865.4466552734375, 2017.154541015625], "score": 0.9999904036521912}, {"category_id": 1, "poly": [864.801513671875, 1302.438232421875, 1566.760986328125, 1302.438232421875, 1566.760986328125, 1498.9681396484375, 864.801513671875, 1498.9681396484375], "score": 0.9999889135360718}, {"category_id": 1, "poly": [133.34628295898438, 1620.0596923828125, 836.7553100585938, 1620.0596923828125, 836.7553100585938, 2018.44873046875, 133.34628295898438, 2018.44873046875], "score": 0.9999861717224121}, {"category_id": 0, 
"poly": [865.5296020507812, 809.8997802734375, 1302.7711181640625, 809.8997802734375, 1302.7711181640625, 841.3140869140625, 865.5296020507812, 841.3140869140625], "score": 0.9999798536300659}, {"category_id": 0, "poly": [1131.11181640625, 378.66229248046875, 1299.6181640625, 378.66229248046875, 1299.6181640625, 409.04852294921875, 1131.11181640625, 409.04852294921875], "score": 0.9999651908874512}, {"category_id": 8, "poly": [1003.5569458007812, 1824.2362060546875, 1420.7132568359375, 1824.2362060546875, 1420.7132568359375, 1905.175048828125, 1003.5569458007812, 1905.175048828125], "score": 0.999914288520813}, {"category_id": 14, "poly": [974, 1076, 1454, 1076, 1454, 1155, 974, 1155], "score": 0.94, "latex": "w(p,q)=\\exp{\\left(-\\frac{\\Delta_{g}(p,q)}{\\gamma_{g}}-\\frac{\\Delta_{c}(p,q)}{\\gamma_{c}}\\right)},"}, {"category_id": 14, "poly": [1006, 1825, 1423, 1825, 1423, 1907, 1006, 1907], "score": 0.94, "latex": "\\delta(q,\\bar{q})=\\sum_{c=\\{r,g,b\\}}\\operatorname*{min}(|q_{c}-\\bar{q}_{c}|,\\tau)."}, {"category_id": 14, "poly": [963, 1510, 1464, 1510, 1464, 1671, 963, 1671], "score": 0.93, "latex": "C(p,\\bar{p})=\\frac{\\displaystyle\\sum_{q\\in\\Omega_{p},\\bar{q}\\in\\Omega_{\\bar{p}}}w(p,q)w(\\bar{p},\\bar{q})\\delta(q,\\bar{q})}{\\displaystyle\\sum_{q\\in\\Omega_{p},\\bar{q}\\in\\Omega_{\\bar{p}}}w(p,q)w(\\bar{p},\\bar{q})}\\,,"}, {"category_id": 13, "poly": [1335, 1166, 1432, 1166, 1432, 1200, 1335, 1200], "score": 0.93, "latex": "\\Delta_{c}(p,q)"}, {"category_id": 13, "poly": [939, 1166, 1039, 1166, 1039, 1201, 939, 1201], "score": 0.93, "latex": "\\Delta_{g}(p,q)"}, {"category_id": 13, "poly": [1289, 1683, 1365, 1683, 1365, 1717, 1289, 1717], "score": 0.93, "latex": "\\delta(q,\\bar{q})"}, {"category_id": 13, "poly": [1362, 1367, 1441, 1367, 1441, 1401, 1362, 1401], "score": 0.92, "latex": "\\bar{p}\\in S_{p}"}, {"category_id": 13, "poly": [864, 1019, 951, 1019, 951, 1053, 864, 1053], "score": 0.92, "latex": "q\\in\\Omega_{p}"}, {"category_id": 
13, "poly": [1351, 953, 1388, 953, 1388, 987, 1351, 987], "score": 0.9, "latex": "\\Omega_{p}"}, {"category_id": 13, "poly": [913, 1467, 949, 1467, 949, 1501, 913, 1501], "score": 0.89, "latex": "\\Omega_{\\bar{p}}"}, {"category_id": 13, "poly": [1531, 1367, 1565, 1367, 1565, 1401, 1531, 1401], "score": 0.89, "latex": "S_{p}"}, {"category_id": 13, "poly": [1528, 1434, 1565, 1434, 1565, 1468, 1528, 1468], "score": 0.89, "latex": "\\Omega_{p}"}, {"category_id": 13, "poly": [1485, 1205, 1516, 1205, 1516, 1234, 1485, 1234], "score": 0.88, "latex": "\\gamma_{g}"}, {"category_id": 13, "poly": [1159, 1206, 1178, 1206, 1178, 1233, 1159, 1233], "score": 0.82, "latex": "p"}, {"category_id": 13, "poly": [863, 1238, 893, 1238, 893, 1266, 863, 1266], "score": 0.82, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [1177, 1436, 1196, 1436, 1196, 1465, 1177, 1465], "score": 0.8, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [1371, 1024, 1391, 1024, 1391, 1051, 1371, 1051], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [1540, 1406, 1558, 1406, 1558, 1432, 1540, 1432], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [1447, 1024, 1465, 1024, 1465, 1051, 1447, 1051], "score": 0.79, "latex": "q"}, {"category_id": 13, "poly": [1101, 1437, 1121, 1437, 1121, 1465, 1101, 1465], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [1389, 1307, 1407, 1307, 1407, 1332, 1389, 1332], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [1230, 1206, 1247, 1206, 1247, 1233, 1230, 1233], "score": 0.78, "latex": "q"}, {"category_id": 13, "poly": [1029, 1372, 1048, 1372, 1048, 1399, 1029, 1399], "score": 0.78, "latex": "p"}, {"category_id": 13, "poly": [916, 1752, 934, 1752, 934, 1782, 916, 1782], "score": 0.76, "latex": "\\bar{q}"}, {"category_id": 13, "poly": [1407, 1925, 1425, 1925, 1425, 1946, 1407, 1946], "score": 0.75, "latex": "\\tau"}, {"category_id": 13, "poly": [1548, 1722, 1565, 1722, 1565, 1749, 1548, 1749], "score": 0.75, "latex": "q"}, 
{"category_id": 13, "poly": [1050, 992, 1068, 992, 1068, 1018, 1050, 1018], "score": 0.75, "latex": "p"}, {"category_id": 15, "poly": [864.0, 1783.0, 1298.0, 1783.0, 1298.0, 1822.0, 864.0, 1822.0], "score": 0.99, "text": "green, and blue components given by"}, {"category_id": 15, "poly": [866.0, 1687.0, 1288.0, 1687.0, 1288.0, 1719.0, 866.0, 1719.0], "score": 0.96, "text": "where the pixel dissimilarity metric"}, {"category_id": 15, "poly": [1366.0, 1687.0, 1564.0, 1687.0, 1564.0, 1719.0, 1366.0, 1719.0], "score": 0.97, "text": "ischosen as the"}, {"category_id": 15, "poly": [866.0, 1751.0, 915.0, 1751.0, 915.0, 1783.0, 866.0, 1783.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [935.0, 1751.0, 1564.0, 1751.0, 1564.0, 1783.0, 935.0, 1783.0], "score": 0.98, "text": ". Here, the truncation of color difference for the red,"}, {"category_id": 15, "poly": [866.0, 1719.0, 1547.0, 1719.0, 1547.0, 1749.0, 866.0, 1749.0], "score": 0.99, "text": "sum of truncated absolute color differences between pixels"}, {"category_id": 15, "poly": [864.0, 163.0, 1568.0, 163.0, 1568.0, 192.0, 864.0, 192.0], "score": 1.0, "text": "temporal information, making it possible to process a temporal"}, {"category_id": 15, "poly": [859.0, 188.0, 1571.0, 193.0, 1570.0, 229.0, 859.0, 225.0], "score": 0.99, "text": " collection of cost volumes. 
The filtering operation was shown"}, {"category_id": 15, "poly": [864.0, 229.0, 1566.0, 229.0, 1566.0, 261.0, 864.0, 261.0], "score": 0.99, "text": "to preserve spatio-temporal edges present in the cost volumes,"}, {"category_id": 15, "poly": [859.0, 261.0, 1564.0, 264.0, 1564.0, 296.0, 859.0, 293.0], "score": 0.98, "text": " resulting in increased temporal consistency of disparity maps,"}, {"category_id": 15, "poly": [864.0, 296.0, 1566.0, 296.0, 1566.0, 328.0, 864.0, 328.0], "score": 0.99, "text": "greater robustness to image noise, and more accurate behavior"}, {"category_id": 15, "poly": [866.0, 328.0, 1160.0, 328.0, 1160.0, 360.0, 866.0, 360.0], "score": 1.0, "text": "around object boundaries."}, {"category_id": 15, "poly": [129.0, 158.0, 841.0, 153.0, 841.0, 192.0, 130.0, 197.0], "score": 0.99, "text": "aggregate cost values within the pixel neighborhoods defined"}, {"category_id": 15, "poly": [129.0, 188.0, 841.0, 190.0, 841.0, 229.0, 129.0, 227.0], "score": 0.99, "text": "by these windows. In 2005, Yoon and Kweon [4] proposed"}, {"category_id": 15, "poly": [132.0, 229.0, 838.0, 229.0, 838.0, 261.0, 132.0, 261.0], "score": 1.0, "text": "an adaptive matching cost aggregation scheme, which assigns"}, {"category_id": 15, "poly": [132.0, 261.0, 838.0, 261.0, 838.0, 293.0, 132.0, 293.0], "score": 0.98, "text": "a weight value to every pixel located in the support window"}, {"category_id": 15, "poly": [132.0, 293.0, 838.0, 293.0, 838.0, 325.0, 132.0, 325.0], "score": 0.98, "text": "of a given pixel of interest. 
The weight value is based on"}, {"category_id": 15, "poly": [132.0, 328.0, 836.0, 328.0, 836.0, 360.0, 132.0, 360.0], "score": 0.99, "text": "the spatial and color similarity between the pixel of interest"}, {"category_id": 15, "poly": [134.0, 360.0, 836.0, 360.0, 836.0, 392.0, 134.0, 392.0], "score": 1.0, "text": "and a pixel in its support window, and the aggregated cost is"}, {"category_id": 15, "poly": [134.0, 394.0, 836.0, 394.0, 836.0, 426.0, 134.0, 426.0], "score": 0.99, "text": "computed as a weighted average of the pixel-wise costs within"}, {"category_id": 15, "poly": [127.0, 422.0, 839.0, 424.0, 838.0, 463.0, 127.0, 461.0], "score": 0.98, "text": " the considered support window. The edge-preserving nature"}, {"category_id": 15, "poly": [129.0, 456.0, 838.0, 454.0, 838.0, 493.0, 129.0, 495.0], "score": 0.99, "text": " and matching accuracy of adaptive support weights have made"}, {"category_id": 15, "poly": [132.0, 490.0, 841.0, 490.0, 841.0, 529.0, 132.0, 529.0], "score": 0.99, "text": "them one of the most popular choices for cost aggregation in"}, {"category_id": 15, "poly": [132.0, 527.0, 797.0, 527.0, 797.0, 559.0, 132.0, 559.0], "score": 0.97, "text": "recently proposed stereo matching algorithms [3], [5]-[8]."}, {"category_id": 15, "poly": [157.0, 958.0, 836.0, 958.0, 836.0, 988.0, 157.0, 988.0], "score": 0.99, "text": "It has been demonstrated that the performance of stereo"}, {"category_id": 15, "poly": [132.0, 990.0, 838.0, 990.0, 838.0, 1022.0, 132.0, 1022.0], "score": 0.99, "text": "algorithms designed to match a single pair of images can"}, {"category_id": 15, "poly": [132.0, 1024.0, 836.0, 1024.0, 836.0, 1056.0, 132.0, 1056.0], "score": 0.99, "text": "be adapted to take advantage of the temporal dependencies"}, {"category_id": 15, "poly": [129.0, 1054.0, 838.0, 1054.0, 838.0, 1093.0, 129.0, 1093.0], "score": 0.97, "text": "available in stereo video sequences. 
Early proposed solutions"}, {"category_id": 15, "poly": [132.0, 1091.0, 836.0, 1091.0, 836.0, 1123.0, 132.0, 1123.0], "score": 0.99, "text": "to temporal stereo matching attempted to average matching"}, {"category_id": 15, "poly": [134.0, 1123.0, 836.0, 1123.0, 836.0, 1155.0, 134.0, 1155.0], "score": 0.99, "text": "costs across subsequent frames of a video sequence [13],"}, {"category_id": 15, "poly": [129.0, 1153.0, 841.0, 1150.0, 841.0, 1189.0, 129.0, 1192.0], "score": 0.98, "text": "[14]. Attempts have been made to integrate estimation of"}, {"category_id": 15, "poly": [134.0, 1192.0, 838.0, 1192.0, 838.0, 1224.0, 134.0, 1224.0], "score": 0.99, "text": "motion fields (optical flow) into temporal stereo matching. The"}, {"category_id": 15, "poly": [132.0, 1224.0, 838.0, 1224.0, 838.0, 1256.0, 132.0, 1256.0], "score": 0.99, "text": "methods of [15] and [16] perform smoothing of disparities"}, {"category_id": 15, "poly": [129.0, 1254.0, 841.0, 1254.0, 841.0, 1292.0, 129.0, 1292.0], "score": 0.99, "text": " along motion vectors recovered from the video sequence. The"}, {"category_id": 15, "poly": [132.0, 1290.0, 838.0, 1290.0, 838.0, 1322.0, 132.0, 1322.0], "score": 0.99, "text": "estimation of the motion field, however, prevents real-time"}, {"category_id": 15, "poly": [132.0, 1325.0, 838.0, 1325.0, 838.0, 1354.0, 132.0, 1354.0], "score": 0.99, "text": "implementation, since state-of-the-art optical flow algorithms"}, {"category_id": 15, "poly": [129.0, 1354.0, 841.0, 1354.0, 841.0, 1393.0, 129.0, 1393.0], "score": 0.99, "text": " do not, in general, approach real-time frame rates. 
In a related"}, {"category_id": 15, "poly": [129.0, 1386.0, 841.0, 1384.0, 841.0, 1423.0, 129.0, 1425.0], "score": 0.99, "text": "approach, Sizintsev and Wildes [17], [18] used steerable"}, {"category_id": 15, "poly": [134.0, 1423.0, 836.0, 1423.0, 836.0, 1455.0, 134.0, 1455.0], "score": 0.99, "text": "filters to obtain descriptors characterizing motion of image"}, {"category_id": 15, "poly": [134.0, 1455.0, 836.0, 1455.0, 836.0, 1487.0, 134.0, 1487.0], "score": 0.99, "text": "features in both space and time. Unlike traditional algorithms,"}, {"category_id": 15, "poly": [132.0, 1490.0, 838.0, 1490.0, 838.0, 1522.0, 132.0, 1522.0], "score": 0.98, "text": "their method performs matching on spatio-temporal motion"}, {"category_id": 15, "poly": [129.0, 1519.0, 841.0, 1517.0, 841.0, 1556.0, 129.0, 1558.0], "score": 0.99, "text": " descriptors, rather than on pure pixel intensity values, which"}, {"category_id": 15, "poly": [132.0, 1554.0, 841.0, 1554.0, 841.0, 1593.0, 132.0, 1593.0], "score": 0.99, "text": "leads to improved temporal coherence of disparity maps at the"}, {"category_id": 15, "poly": [132.0, 1586.0, 698.0, 1586.0, 698.0, 1618.0, 132.0, 1618.0], "score": 0.99, "text": "cost of reduced accuracy at depth discontinuities."}, {"category_id": 15, "poly": [159.0, 559.0, 838.0, 559.0, 838.0, 591.0, 159.0, 591.0], "score": 0.99, "text": "Recently, Rheman et al. [9], [10] have revisited the cost"}, {"category_id": 15, "poly": [132.0, 594.0, 838.0, 589.0, 839.0, 621.0, 132.0, 626.0], "score": 1.0, "text": "aggregation step of stereo algorithms, and demonstrated that"}, {"category_id": 15, "poly": [132.0, 626.0, 838.0, 626.0, 838.0, 658.0, 132.0, 658.0], "score": 0.99, "text": "cost aggregation can be performed by filtering of subsequent"}, {"category_id": 15, "poly": [134.0, 660.0, 834.0, 660.0, 834.0, 692.0, 134.0, 692.0], "score": 1.0, "text": "layers of the initially computed matching cost volume. 
In par-"}, {"category_id": 15, "poly": [132.0, 692.0, 836.0, 692.0, 836.0, 724.0, 132.0, 724.0], "score": 0.99, "text": "ticular, the edge-aware image filters, such as the bilateral filter"}, {"category_id": 15, "poly": [127.0, 719.0, 839.0, 724.0, 838.0, 761.0, 127.0, 756.0], "score": 0.99, "text": " of Tomasi and Manducci [11] or the guided filter of He [12],"}, {"category_id": 15, "poly": [132.0, 759.0, 838.0, 759.0, 838.0, 791.0, 132.0, 791.0], "score": 0.98, "text": "have been rendered useful for the problem of matching cost"}, {"category_id": 15, "poly": [132.0, 793.0, 838.0, 791.0, 838.0, 823.0, 132.0, 825.0], "score": 0.99, "text": "aggregation, enabling stereo algorithms to correctly recover"}, {"category_id": 15, "poly": [134.0, 825.0, 838.0, 825.0, 838.0, 857.0, 134.0, 857.0], "score": 0.98, "text": "disparities along object boundaries. In fact, Yoon and Kweon's"}, {"category_id": 15, "poly": [134.0, 859.0, 838.0, 859.0, 838.0, 891.0, 134.0, 891.0], "score": 1.0, "text": "adaptive support-weight cost aggregation scheme is equivalent"}, {"category_id": 15, "poly": [132.0, 891.0, 838.0, 891.0, 838.0, 924.0, 132.0, 924.0], "score": 0.98, "text": "to the application of the so-called joint bilateral filter to the"}, {"category_id": 15, "poly": [134.0, 924.0, 547.0, 924.0, 547.0, 956.0, 134.0, 956.0], "score": 1.0, "text": "layers of the matching cost volume."}, {"category_id": 15, "poly": [889.0, 422.0, 1568.0, 424.0, 1568.0, 456.0, 889.0, 454.0], "score": 0.98, "text": "The proposed temporal stereo matching algorithm is an"}, {"category_id": 15, "poly": [862.0, 456.0, 1571.0, 456.0, 1571.0, 495.0, 862.0, 495.0], "score": 1.0, "text": "extension of the real-time iterative adaptive support-weight"}, {"category_id": 15, "poly": [864.0, 490.0, 1568.0, 490.0, 1568.0, 522.0, 864.0, 522.0], "score": 0.99, "text": "algorithm described in [3]. 
In addition to real-time two-"}, {"category_id": 15, "poly": [864.0, 525.0, 1566.0, 525.0, 1566.0, 557.0, 864.0, 557.0], "score": 1.0, "text": "pass aggregation of the cost values in the spatial domain,"}, {"category_id": 15, "poly": [864.0, 557.0, 1568.0, 557.0, 1568.0, 589.0, 864.0, 589.0], "score": 0.99, "text": "the proposed algorithm enhances stereo matching on video"}, {"category_id": 15, "poly": [866.0, 594.0, 1566.0, 594.0, 1566.0, 626.0, 866.0, 626.0], "score": 0.97, "text": "sequences by aggregating costs along the time dimension."}, {"category_id": 15, "poly": [864.0, 626.0, 1568.0, 626.0, 1568.0, 658.0, 864.0, 658.0], "score": 1.0, "text": "The operation of the algorithm has been divided into four"}, {"category_id": 15, "poly": [866.0, 660.0, 1568.0, 660.0, 1568.0, 692.0, 866.0, 692.0], "score": 0.99, "text": "stages: 1) two-pass spatial cost aggregation, 2) temporal cost"}, {"category_id": 15, "poly": [862.0, 688.0, 1568.0, 685.0, 1568.0, 724.0, 862.0, 727.0], "score": 1.0, "text": "aggregation, 3) disparity selection and confidence assessment,"}, {"category_id": 15, "poly": [866.0, 724.0, 1568.0, 724.0, 1568.0, 756.0, 866.0, 756.0], "score": 1.0, "text": "and 4) iterative disparity refinement. 
In the following, each of"}, {"category_id": 15, "poly": [864.0, 759.0, 1254.0, 759.0, 1254.0, 791.0, 864.0, 791.0], "score": 1.0, "text": "these stages is described in detail."}, {"category_id": 15, "poly": [860.0, 1265.0, 1194.0, 1270.0, 1194.0, 1306.0, 859.0, 1301.0], "score": 0.99, "text": " color similarity, respectively."}, {"category_id": 15, "poly": [1433.0, 1169.0, 1566.0, 1169.0, 1566.0, 1201.0, 1433.0, 1201.0], "score": 0.98, "text": "is the color"}, {"category_id": 15, "poly": [864.0, 1169.0, 938.0, 1169.0, 938.0, 1201.0, 864.0, 1201.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [1040.0, 1169.0, 1334.0, 1169.0, 1334.0, 1201.0, 1040.0, 1201.0], "score": 0.98, "text": "is the geometric distance,"}, {"category_id": 15, "poly": [1517.0, 1196.0, 1566.0, 1201.0, 1566.0, 1240.0, 1517.0, 1235.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [862.0, 1196.0, 1158.0, 1201.0, 1158.0, 1240.0, 861.0, 1235.0], "score": 1.0, "text": "difference between pixels"}, {"category_id": 15, "poly": [894.0, 1233.0, 1566.0, 1231.0, 1566.0, 1270.0, 894.0, 1272.0], "score": 0.97, "text": "regulate the strength of grouping by geometric distance and"}, {"category_id": 15, "poly": [1179.0, 1196.0, 1229.0, 1201.0, 1229.0, 1240.0, 1179.0, 1235.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1248.0, 1196.0, 1484.0, 1201.0, 1484.0, 1240.0, 1248.0, 1235.0], "score": 0.99, "text": ", and the coefficients"}, {"category_id": 15, "poly": [887.0, 848.0, 1568.0, 850.0, 1568.0, 889.0, 887.0, 887.0], "score": 0.99, "text": " Humans group shapes by observing the geometric distance"}, {"category_id": 15, "poly": [859.0, 885.0, 1568.0, 882.0, 1568.0, 921.0, 859.0, 924.0], "score": 0.98, "text": " and color similarity of points in space. 
To mimic this vi-"}, {"category_id": 15, "poly": [864.0, 921.0, 1568.0, 921.0, 1568.0, 953.0, 864.0, 953.0], "score": 0.99, "text": "sual grouping, the adaptive support-weight stereo matching"}, {"category_id": 15, "poly": [864.0, 1054.0, 899.0, 1054.0, 899.0, 1084.0, 864.0, 1084.0], "score": 1.0, "text": "by"}, {"category_id": 15, "poly": [866.0, 956.0, 1350.0, 956.0, 1350.0, 988.0, 866.0, 988.0], "score": 0.98, "text": "algorithm [4] considers a support window"}, {"category_id": 15, "poly": [1389.0, 956.0, 1566.0, 956.0, 1566.0, 988.0, 1389.0, 988.0], "score": 0.98, "text": " centered at the"}, {"category_id": 15, "poly": [952.0, 1022.0, 1370.0, 1022.0, 1370.0, 1054.0, 952.0, 1054.0], "score": 0.98, "text": ". The support weight relating pixels"}, {"category_id": 15, "poly": [1392.0, 1022.0, 1446.0, 1022.0, 1446.0, 1054.0, 1392.0, 1054.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1466.0, 1022.0, 1566.0, 1022.0, 1566.0, 1054.0, 1466.0, 1054.0], "score": 0.98, "text": "is given"}, {"category_id": 15, "poly": [866.0, 990.0, 1049.0, 990.0, 1049.0, 1022.0, 866.0, 1022.0], "score": 1.0, "text": "pixel of interest"}, {"category_id": 15, "poly": [1069.0, 990.0, 1566.0, 990.0, 1566.0, 1022.0, 1069.0, 1022.0], "score": 1.0, "text": ", and assigns a support weight to each pixel"}, {"category_id": 15, "poly": [862.0, 1948.0, 1568.0, 1950.0, 1568.0, 1989.0, 861.0, 1987.0], "score": 0.98, "text": "vides additional robustness to outliers. 
Rather than evaluating"}, {"category_id": 15, "poly": [864.0, 1989.0, 1566.0, 1989.0, 1566.0, 2021.0, 864.0, 2021.0], "score": 0.98, "text": "Equation (2) directly, real-time algorithms often approximate"}, {"category_id": 15, "poly": [862.0, 1920.0, 1406.0, 1920.0, 1406.0, 1952.0, 862.0, 1952.0], "score": 0.99, "text": "This limits each of their magnitudes to at most"}, {"category_id": 15, "poly": [1426.0, 1920.0, 1561.0, 1920.0, 1561.0, 1952.0, 1426.0, 1952.0], "score": 0.96, "text": ",whichpro-"}, {"category_id": 15, "poly": [859.0, 1331.0, 1571.0, 1334.0, 1571.0, 1373.0, 859.0, 1370.0], "score": 0.98, "text": " iterative adaptive support-weight algorithm evaluates matching"}, {"category_id": 15, "poly": [859.0, 1464.0, 912.0, 1467.0, 912.0, 1506.0, 859.0, 1503.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [950.0, 1464.0, 1474.0, 1467.0, 1474.0, 1506.0, 950.0, 1503.0], "score": 1.0, "text": ", the initial matching cost is aggregated using"}, {"category_id": 15, "poly": [1442.0, 1370.0, 1530.0, 1370.0, 1530.0, 1402.0, 1442.0, 1402.0], "score": 0.98, "text": ", where"}, {"category_id": 15, "poly": [1197.0, 1437.0, 1527.0, 1437.0, 1527.0, 1469.0, 1197.0, 1469.0], "score": 0.97, "text": ", and their support windows"}, {"category_id": 15, "poly": [866.0, 1402.0, 1539.0, 1402.0, 1539.0, 1435.0, 866.0, 1435.0], "score": 1.0, "text": "denotes a set of matching candidates associated with pixel"}, {"category_id": 15, "poly": [864.0, 1437.0, 1100.0, 1437.0, 1100.0, 1469.0, 864.0, 1469.0], "score": 0.97, "text": "For a pair of pixels"}, {"category_id": 15, "poly": [1122.0, 1437.0, 1176.0, 1437.0, 1176.0, 1469.0, 1122.0, 1469.0], "score": 0.94, "text": " and"}, {"category_id": 15, "poly": [887.0, 1299.0, 1388.0, 1304.0, 1388.0, 1336.0, 887.0, 1331.0], "score": 0.96, "text": " To identify a match for the pixel of interest"}, {"category_id": 15, "poly": [1408.0, 1299.0, 1568.0, 1304.0, 1568.0, 1336.0, 1408.0, 1331.0], "score": 1.0, "text": ", the real-time"}, 
{"category_id": 15, "poly": [864.0, 1370.0, 1028.0, 1370.0, 1028.0, 1402.0, 864.0, 1402.0], "score": 1.0, "text": "costs between"}, {"category_id": 15, "poly": [1049.0, 1370.0, 1361.0, 1370.0, 1361.0, 1402.0, 1049.0, 1402.0], "score": 0.99, "text": " and every match candidate"}, {"category_id": 15, "poly": [160.0, 1618.0, 836.0, 1623.0, 836.0, 1655.0, 159.0, 1650.0], "score": 0.99, "text": "Most recently, local stereo algorithms based on edge-aware"}, {"category_id": 15, "poly": [127.0, 1650.0, 841.0, 1652.0, 841.0, 1691.0, 127.0, 1689.0], "score": 0.97, "text": " filters were extended to incorporate temporal evidence into"}, {"category_id": 15, "poly": [132.0, 1687.0, 836.0, 1687.0, 836.0, 1719.0, 132.0, 1719.0], "score": 0.97, "text": "the matching process. The method of Richardt et al. [19]"}, {"category_id": 15, "poly": [134.0, 1723.0, 838.0, 1723.0, 838.0, 1753.0, 134.0, 1753.0], "score": 0.99, "text": "employs a variant of the bilateral grid [20] implemented on"}, {"category_id": 15, "poly": [134.0, 1755.0, 838.0, 1755.0, 838.0, 1788.0, 134.0, 1788.0], "score": 0.99, "text": "graphics hardware, which accelerates cost aggregation and"}, {"category_id": 15, "poly": [134.0, 1788.0, 838.0, 1788.0, 838.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "allows for weighted propagation of pixel dissimilarity metrics"}, {"category_id": 15, "poly": [132.0, 1822.0, 838.0, 1822.0, 838.0, 1854.0, 132.0, 1854.0], "score": 0.99, "text": "from previous frames to the current one. 
Although this method"}, {"category_id": 15, "poly": [129.0, 1856.0, 838.0, 1856.0, 838.0, 1888.0, 129.0, 1888.0], "score": 1.0, "text": " outperforms the baseline frame-to-frame approach, the amount"}, {"category_id": 15, "poly": [132.0, 1888.0, 838.0, 1888.0, 838.0, 1920.0, 132.0, 1920.0], "score": 0.97, "text": "of hardware memory necessary to construct the bilateral grid"}, {"category_id": 15, "poly": [127.0, 1916.0, 841.0, 1918.0, 841.0, 1957.0, 127.0, 1955.0], "score": 0.99, "text": "limits its application to single-channel, i.e., grayscale images "}, {"category_id": 15, "poly": [132.0, 1955.0, 838.0, 1955.0, 838.0, 1985.0, 132.0, 1985.0], "score": 0.99, "text": "only. Hosni et al. [10], on the other hand, reformulated kernels"}, {"category_id": 15, "poly": [132.0, 1989.0, 838.0, 1989.0, 838.0, 2021.0, 132.0, 2021.0], "score": 0.99, "text": "of the guided image filter to operate on both spatial and"}, {"category_id": 15, "poly": [859.0, 809.0, 1307.0, 809.0, 1307.0, 848.0, 859.0, 848.0], "score": 0.99, "text": "A. Two-Pass Spatial Cost Aggregation"}, {"category_id": 15, "poly": [1129.0, 376.0, 1300.0, 376.0, 1300.0, 417.0, 1129.0, 417.0], "score": 0.94, "text": "III. 
METHOD"}], "page_info": {"page_no": 1, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [865.5088500976562, 856.5537109375, 1567.692626953125, 856.5537109375, 1567.692626953125, 1420.9698486328125, 865.5088500976562, 1420.9698486328125], "score": 0.9999963045120239}, {"category_id": 8, "poly": [281.1294860839844, 1001.0513916015625, 689.37451171875, 1001.0513916015625, 689.37451171875, 1075.8765869140625, 281.1294860839844, 1075.8765869140625], "score": 0.9999961256980896}, {"category_id": 1, "poly": [133.53353881835938, 158.6427459716797, 836.7297973632812, 158.6427459716797, 836.7297973632812, 390.48828125, 133.53353881835938, 390.48828125], "score": 0.9999960660934448}, {"category_id": 8, "poly": [145.77777099609375, 1839.6416015625, 803.4192504882812, 1839.6416015625, 803.4192504882812, 1993.239013671875, 145.77777099609375, 1993.239013671875], "score": 0.9999958872795105}, {"category_id": 1, "poly": [864.9884643554688, 1420.8831787109375, 1567.3118896484375, 1420.8831787109375, 1567.3118896484375, 2023.257080078125, 864.9884643554688, 2023.257080078125], "score": 0.9999951124191284}, {"category_id": 9, "poly": [1529.267333984375, 388.6717834472656, 1565.1744384765625, 388.6717834472656, 1565.1744384765625, 416.4899597167969, 1529.267333984375, 416.4899597167969], "score": 0.9999918937683105}, {"category_id": 9, "poly": [800.3933715820312, 1551.524169921875, 833.2618408203125, 1551.524169921875, 833.2618408203125, 1582.073486328125, 800.3933715820312, 1582.073486328125], "score": 0.9999911189079285}, {"category_id": 1, "poly": [864.3720092773438, 200.97483825683594, 1565.6871337890625, 200.97483825683594, 1565.6871337890625, 365.6230163574219, 864.3720092773438, 365.6230163574219], "score": 0.9999903440475464}, {"category_id": 1, "poly": [134.87628173828125, 1369.5762939453125, 835.0336303710938, 1369.5762939453125, 835.0336303710938, 1533.884765625, 134.87628173828125, 1533.884765625], "score": 0.9999880790710449}, {"category_id": 
1, "poly": [134.59988403320312, 444.5299377441406, 836.5606079101562, 444.5299377441406, 836.5606079101562, 709.0791015625, 134.59988403320312, 709.0791015625], "score": 0.999987006187439}, {"category_id": 1, "poly": [134.15472412109375, 1084.4288330078125, 836.2360229492188, 1084.4288330078125, 836.2360229492188, 1314.6600341796875, 134.15472412109375, 1314.6600341796875], "score": 0.9999866485595703}, {"category_id": 9, "poly": [800.6007690429688, 1023.1047973632812, 833.2154541015625, 1023.1047973632812, 833.2154541015625, 1055.7227783203125, 800.6007690429688, 1055.7227783203125], "score": 0.9999839663505554}, {"category_id": 8, "poly": [948.4016723632812, 372.03607177734375, 1486.11279296875, 372.03607177734375, 1486.11279296875, 449.3696594238281, 948.4016723632812, 449.3696594238281], "score": 0.9999831914901733}, {"category_id": 8, "poly": [145.31065368652344, 714.4036254882812, 820.3599853515625, 714.4036254882812, 820.3599853515625, 791.855712890625, 145.31065368652344, 791.855712890625], "score": 0.9999772906303406}, {"category_id": 1, "poly": [863.8760986328125, 599.6033325195312, 1566.84619140625, 599.6033325195312, 1566.84619140625, 797.44189453125, 863.8760986328125, 797.44189453125], "score": 0.999976396560669}, {"category_id": 1, "poly": [864.925537109375, 464.9669189453125, 1565.212158203125, 464.9669189453125, 1565.212158203125, 529.045654296875, 864.925537109375, 529.045654296875], "score": 0.999973475933075}, {"category_id": 1, "poly": [133.88735961914062, 797.7457885742188, 835.5986328125, 797.7457885742188, 835.5986328125, 994.4456176757812, 133.88735961914062, 994.4456176757812], "score": 0.9999661445617676}, {"category_id": 1, "poly": [134.8787841796875, 1615.116455078125, 835.4554443359375, 1615.116455078125, 835.4554443359375, 1815.4564208984375, 134.8787841796875, 1815.4564208984375], "score": 0.9999580383300781}, {"category_id": 9, "poly": [1530.1783447265625, 550.1576538085938, 1564.607177734375, 550.1576538085938, 1564.607177734375, 
578.6950073242188, 1530.1783447265625, 578.6950073242188], "score": 0.9999532103538513}, {"category_id": 9, "poly": [801.0740966796875, 738.4259643554688, 834.7449340820312, 738.4259643554688, 834.7449340820312, 770.4969482421875, 801.0740966796875, 770.4969482421875], "score": 0.9996598958969116}, {"category_id": 0, "poly": [1134.302490234375, 815.6021728515625, 1295.3885498046875, 815.6021728515625, 1295.3885498046875, 844.6544799804688, 1134.302490234375, 844.6544799804688], "score": 0.9994980096817017}, {"category_id": 9, "poly": [798.6090698242188, 1986.7332763671875, 834.5460205078125, 1986.7332763671875, 834.5460205078125, 2017.6595458984375, 798.6090698242188, 2017.6595458984375], "score": 0.9992558360099792}, {"category_id": 0, "poly": [135.0093994140625, 406.12335205078125, 475.6328125, 406.12335205078125, 475.6328125, 437.4545593261719, 135.0093994140625, 437.4545593261719], "score": 0.9990860819816589}, {"category_id": 8, "poly": [1029.3924560546875, 541.857177734375, 1400.174072265625, 541.857177734375, 1400.174072265625, 585.1640625, 1029.3924560546875, 585.1640625], "score": 0.9979717135429382}, {"category_id": 0, "poly": [133.26077270507812, 1330.139892578125, 713.5426635742188, 1330.139892578125, 713.5426635742188, 1363.1341552734375, 133.26077270507812, 1363.1341552734375], "score": 0.9967154860496521}, {"category_id": 8, "poly": [338.6681823730469, 1547.7218017578125, 626.6519775390625, 1547.7218017578125, 626.6519775390625, 1604.587646484375, 338.6681823730469, 1604.587646484375], "score": 0.9945433139801025}, {"category_id": 1, "poly": [864.5469970703125, 160.16702270507812, 1251.313720703125, 160.16702270507812, 1251.313720703125, 190.15760803222656, 864.5469970703125, 190.15760803222656], "score": 0.9902143478393555}, {"category_id": 13, "poly": [550, 577, 648, 577, 648, 612, 550, 612], "score": 0.95, "latex": "C_{a}(p,\\bar{p})"}, {"category_id": 13, "poly": [183, 1780, 304, 1780, 304, 1813, 183, 1813], "score": 0.95, "latex": 
"p^{\\prime}=m(\\bar{p})"}, {"category_id": 14, "poly": [279, 1000, 687, 1000, 687, 1078, 279, 1078], "score": 0.95, "latex": "w_{t}(p,p_{t-1})=\\exp\\bigg({-\\frac{\\Delta_{c}(p,p_{t-1})}{\\gamma_{t}}}\\bigg),"}, {"category_id": 14, "poly": [147, 1843, 820, 1843, 820, 1992, 147, 1992], "score": 0.94, "latex": "F_{p}=\\left\\{\\begin{array}{l l}{\\underset{\\bar{p}\\in S_{p}\\setminus m(p)}{\\mathrm{min}}\\,C(p,\\bar{p})-\\underset{\\bar{p}\\in S_{p}}{\\mathrm{min}}\\,C(p,\\bar{p})}\\\\ {\\underset{\\bar{p}\\in S_{p}\\setminus m(p)}{\\mathrm{min}}\\,C(p,\\bar{p})}&{|d_{p}-d_{p^{\\prime}}|\\leq1}\\\\ {0,}&{\\mathrm{otherwise}}\\end{array}\\right.."}, {"category_id": 14, "poly": [340, 1546, 628, 1546, 628, 1608, 340, 1608], "score": 0.93, "latex": "m(p)=\\underset{\\bar{p}\\in S_{p}}{\\mathrm{argmin}}\\,C(p,\\bar{p})\\,."}, {"category_id": 13, "poly": [321, 830, 443, 830, 443, 864, 321, 864], "score": 0.93, "latex": "w_{t}(p,p_{t-1})"}, {"category_id": 13, "poly": [581, 1713, 694, 1713, 694, 1747, 581, 1747], "score": 0.93, "latex": "{\\bar{p}}=m(p)"}, {"category_id": 14, "poly": [947, 373, 1478, 373, 1478, 454, 947, 454], "score": 0.93, "latex": "\\Lambda^{i}(p,\\bar{p})=\\alpha\\times\\sum_{q\\in\\Omega_{p}}w(p,q)F_{q}^{i-1}\\left|D_{q}^{i-1}-d_{p}\\right|\\,,"}, {"category_id": 13, "poly": [426, 445, 512, 445, 512, 479, 426, 479], "score": 0.93, "latex": "C(p,{\\bar{p}})"}, {"category_id": 13, "poly": [337, 356, 414, 356, 414, 391, 337, 391], "score": 0.93, "latex": "\\mathcal{O}(\\omega^{2})"}, {"category_id": 13, "poly": [1341, 730, 1565, 730, 1565, 765, 1341, 765], "score": 0.92, "latex": "C_{a}(p,\\bar{p})\\gets C(p,\\bar{p})"}, {"category_id": 13, "poly": [629, 1436, 691, 1436, 691, 1470, 629, 1470], "score": 0.92, "latex": "m(p)"}, {"category_id": 13, "poly": [277, 1469, 361, 1469, 361, 1504, 277, 1504], "score": 0.92, "latex": "\\bar{p}\\in S_{p}"}, {"category_id": 14, "poly": [1030, 541, 1398, 541, 1398, 582, 1030, 582], "score": 0.92, "latex": 
"C^{i}(p,\\bar{p})=C^{0}(p,\\bar{p})+{\\Lambda^{i}}(p,\\bar{p})\\,,"}, {"category_id": 13, "poly": [453, 356, 518, 356, 518, 391, 453, 391], "score": 0.91, "latex": "\\mathcal{O}(\\omega)"}, {"category_id": 14, "poly": [146, 714, 787, 714, 787, 791, 146, 791], "score": 0.91, "latex": "C(p,\\bar{p})\\gets\\frac{(1-\\lambda)\\cdot C(p,\\bar{p})+\\lambda\\cdot w_{t}(p,p_{t-1})\\cdot C_{a}(p,\\bar{p})}{(1-\\lambda)+\\lambda\\cdot w_{t}(p,p_{t-1})},"}, {"category_id": 13, "poly": [1095, 231, 1134, 231, 1134, 270, 1095, 270], "score": 0.9, "latex": "D_{p}^{i}"}, {"category_id": 13, "poly": [1313, 1752, 1447, 1752, 1447, 1783, 1313, 1783], "score": 0.89, "latex": "640~\\times~480"}, {"category_id": 13, "poly": [593, 1782, 627, 1782, 627, 1815, 593, 1815], "score": 0.89, "latex": "F_{p}"}, {"category_id": 13, "poly": [133, 326, 209, 326, 209, 355, 133, 355], "score": 0.88, "latex": "\\omega\\times\\omega"}, {"category_id": 13, "poly": [208, 1089, 236, 1089, 236, 1116, 208, 1116], "score": 0.85, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [1466, 769, 1484, 769, 1484, 797, 1466, 797], "score": 0.83, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [133, 935, 177, 935, 177, 963, 133, 963], "score": 0.83, "latex": "p_{t-1}"}, {"category_id": 13, "poly": [608, 1753, 627, 1753, 627, 1779, 608, 1779], "score": 0.81, "latex": "p"}, {"category_id": 13, "poly": [491, 799, 511, 799, 511, 825, 491, 825], "score": 0.81, "latex": "\\lambda"}, {"category_id": 13, "poly": [1018, 770, 1037, 770, 1037, 796, 1018, 796], "score": 0.81, "latex": "p"}, {"category_id": 13, "poly": [1086, 470, 1107, 470, 1107, 491, 1086, 491], "score": 0.8, "latex": "\\alpha"}, {"category_id": 13, "poly": [466, 901, 485, 901, 485, 929, 466, 929], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [208, 484, 227, 484, 227, 511, 208, 511], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [462, 1443, 480, 1443, 480, 1468, 462, 1468], "score": 0.77, "latex": "p"}, {"category_id": 13, "poly": 
[266, 514, 288, 514, 288, 544, 266, 544], "score": 0.77, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [816, 1716, 836, 1716, 836, 1746, 816, 1746], "score": 0.73, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [132, 405, 154, 405, 154, 432, 132, 432], "score": 0.27, "latex": "B"}, {"category_id": 13, "poly": [862, 160, 887, 160, 887, 187, 862, 187], "score": 0.26, "latex": "D"}, {"category_id": 15, "poly": [887.0, 852.0, 1568.0, 855.0, 1568.0, 894.0, 887.0, 891.0], "score": 0.98, "text": " The speed and accuracy of real-time stereo matching al-"}, {"category_id": 15, "poly": [864.0, 891.0, 1566.0, 891.0, 1566.0, 924.0, 864.0, 924.0], "score": 0.99, "text": "gorithms are traditionally demonstrated using still-frame im-"}, {"category_id": 15, "poly": [859.0, 921.0, 1571.0, 919.0, 1571.0, 958.0, 859.0, 960.0], "score": 0.97, "text": " ages from the Middlebury stereo benchmark [1], [2]. Still"}, {"category_id": 15, "poly": [862.0, 956.0, 1568.0, 958.0, 1568.0, 990.0, 862.0, 988.0], "score": 0.99, "text": "frames, however, are insufficient for evaluating stereo match-"}, {"category_id": 15, "poly": [864.0, 992.0, 1571.0, 992.0, 1571.0, 1024.0, 864.0, 1024.0], "score": 1.0, "text": "ing algorithms that incorporate frame-to-frame prediction to"}, {"category_id": 15, "poly": [864.0, 1027.0, 1568.0, 1027.0, 1568.0, 1059.0, 864.0, 1059.0], "score": 0.97, "text": "enhance matching accuracy. An alternative approach is to"}, {"category_id": 15, "poly": [864.0, 1059.0, 1566.0, 1059.0, 1566.0, 1089.0, 864.0, 1089.0], "score": 0.99, "text": "use a stereo video sequence with a ground truth disparity"}, {"category_id": 15, "poly": [862.0, 1091.0, 1566.0, 1091.0, 1566.0, 1123.0, 862.0, 1123.0], "score": 1.0, "text": "for each frame. 
Obtaining the ground truth disparity of real"}, {"category_id": 15, "poly": [866.0, 1125.0, 1566.0, 1125.0, 1566.0, 1157.0, 866.0, 1157.0], "score": 0.98, "text": "world video sequences is a difficult undertaking due to the"}, {"category_id": 15, "poly": [859.0, 1153.0, 1568.0, 1155.0, 1568.0, 1194.0, 859.0, 1192.0], "score": 0.99, "text": "high frame rate of video and limitations in depth sensing-"}, {"category_id": 15, "poly": [864.0, 1192.0, 1568.0, 1192.0, 1568.0, 1224.0, 864.0, 1224.0], "score": 0.99, "text": "technology. To address the need for stereo video with ground"}, {"category_id": 15, "poly": [864.0, 1224.0, 1568.0, 1224.0, 1568.0, 1256.0, 864.0, 1256.0], "score": 0.99, "text": "truth disparities, five pairs of synthetic stereo video sequences"}, {"category_id": 15, "poly": [864.0, 1258.0, 1568.0, 1258.0, 1568.0, 1290.0, 864.0, 1290.0], "score": 0.99, "text": "of a computer-generated scene were given in [19]. While these"}, {"category_id": 15, "poly": [864.0, 1290.0, 1566.0, 1290.0, 1566.0, 1322.0, 864.0, 1322.0], "score": 1.0, "text": "videos incorporate a sufficient amount of movement variation,"}, {"category_id": 15, "poly": [862.0, 1325.0, 1568.0, 1325.0, 1568.0, 1357.0, 862.0, 1357.0], "score": 0.99, "text": "they were generated from relatively simple models using low-"}, {"category_id": 15, "poly": [862.0, 1359.0, 1571.0, 1359.0, 1571.0, 1389.0, 862.0, 1389.0], "score": 0.99, "text": "resolution rendering, and they do not provide occlusion or"}, {"category_id": 15, "poly": [862.0, 1386.0, 1088.0, 1394.0, 1087.0, 1426.0, 861.0, 1418.0], "score": 0.98, "text": "discontinuity maps."}, {"category_id": 15, "poly": [129.0, 156.0, 839.0, 158.0, 838.0, 197.0, 129.0, 195.0], "score": 0.99, "text": "the matching cost by performing two-pass aggregation using"}, {"category_id": 15, "poly": [130.0, 188.0, 841.0, 193.0, 841.0, 229.0, 129.0, 225.0], "score": 0.98, "text": "two orthogonal 1D windows [5], [6], [8]. 
The two-pass method "}, {"category_id": 15, "poly": [129.0, 225.0, 841.0, 222.0, 841.0, 261.0, 129.0, 264.0], "score": 0.99, "text": "first aggregates matching costs in the vertical direction, and"}, {"category_id": 15, "poly": [134.0, 261.0, 838.0, 261.0, 838.0, 293.0, 134.0, 293.0], "score": 0.99, "text": "then computes a weighted sum of the aggregated costs in the"}, {"category_id": 15, "poly": [132.0, 291.0, 838.0, 291.0, 838.0, 330.0, 132.0, 330.0], "score": 0.99, "text": "horizontal direction. Given that support regions are of size"}, {"category_id": 15, "poly": [136.0, 360.0, 336.0, 360.0, 336.0, 392.0, 136.0, 392.0], "score": 0.99, "text": "aggregation from"}, {"category_id": 15, "poly": [415.0, 360.0, 452.0, 360.0, 452.0, 392.0, 415.0, 392.0], "score": 0.98, "text": "to"}, {"category_id": 15, "poly": [210.0, 321.0, 836.0, 321.0, 836.0, 360.0, 210.0, 360.0], "score": 0.98, "text": ", the two-pass method reduces the complexity of cost"}, {"category_id": 15, "poly": [887.0, 1416.0, 1571.0, 1419.0, 1571.0, 1458.0, 887.0, 1455.0], "score": 0.98, "text": " To evaluate the performance of temporal aggregation, a"}, {"category_id": 15, "poly": [862.0, 1453.0, 1566.0, 1453.0, 1566.0, 1485.0, 862.0, 1485.0], "score": 0.98, "text": "new synthetic stereo video sequence is introduced along with"}, {"category_id": 15, "poly": [862.0, 1490.0, 1566.0, 1487.0, 1566.0, 1519.0, 862.0, 1522.0], "score": 0.99, "text": "corresponding disparity maps, occlusion maps, and disconti-"}, {"category_id": 15, "poly": [862.0, 1519.0, 1571.0, 1519.0, 1571.0, 1558.0, 862.0, 1558.0], "score": 0.99, "text": "nuity maps for evaluating the performance of temporal stereo"}, {"category_id": 15, "poly": [864.0, 1556.0, 1568.0, 1556.0, 1568.0, 1588.0, 864.0, 1588.0], "score": 1.0, "text": "matching algorithms. 
To create the video sequence, a complex"}, {"category_id": 15, "poly": [864.0, 1590.0, 1568.0, 1590.0, 1568.0, 1620.0, 864.0, 1620.0], "score": 0.99, "text": "scene was constructed using Google Sketchup and a pair"}, {"category_id": 15, "poly": [864.0, 1622.0, 1568.0, 1622.0, 1568.0, 1655.0, 864.0, 1655.0], "score": 0.99, "text": "of animated paths were rendered photorealistically using the"}, {"category_id": 15, "poly": [859.0, 1650.0, 1571.0, 1652.0, 1571.0, 1691.0, 859.0, 1689.0], "score": 0.99, "text": " Kerkythea rendering software. Realistic material properties"}, {"category_id": 15, "poly": [864.0, 1689.0, 1566.0, 1689.0, 1566.0, 1721.0, 864.0, 1721.0], "score": 1.0, "text": "were used to give surfaces a natural-looking appearance by"}, {"category_id": 15, "poly": [864.0, 1723.0, 1566.0, 1723.0, 1566.0, 1755.0, 864.0, 1755.0], "score": 0.98, "text": "adjusting their specularity, reflectance, and diffusion. The"}, {"category_id": 15, "poly": [864.0, 1788.0, 1568.0, 1788.0, 1568.0, 1820.0, 864.0, 1820.0], "score": 1.0, "text": "frame rate of 30 frames per second, and a duration of 4"}, {"category_id": 15, "poly": [862.0, 1817.0, 1568.0, 1820.0, 1568.0, 1859.0, 861.0, 1856.0], "score": 0.98, "text": "seconds. In addition to performing photorealistic rendering."}, {"category_id": 15, "poly": [864.0, 1856.0, 1568.0, 1856.0, 1568.0, 1888.0, 864.0, 1888.0], "score": 0.99, "text": "depth renders of both video sequences were also generated and"}, {"category_id": 15, "poly": [864.0, 1888.0, 1566.0, 1888.0, 1566.0, 1920.0, 864.0, 1920.0], "score": 0.98, "text": "converted to ground truth disparity for the stereo video. 
The"}, {"category_id": 15, "poly": [862.0, 1920.0, 1564.0, 1920.0, 1564.0, 1952.0, 862.0, 1952.0], "score": 0.99, "text": "video sequences and ground truth data have been made avail-"}, {"category_id": 15, "poly": [862.0, 1950.0, 1566.0, 1953.0, 1566.0, 1985.0, 862.0, 1982.0], "score": 0.99, "text": "able at http://mc2.unl.edu/current-research"}, {"category_id": 15, "poly": [866.0, 1989.0, 1566.0, 1989.0, 1566.0, 2019.0, 866.0, 2019.0], "score": 0.98, "text": "/ image-processing/. Figure 2 shows two sample frames"}, {"category_id": 15, "poly": [862.0, 1755.0, 1312.0, 1755.0, 1312.0, 1788.0, 862.0, 1788.0], "score": 0.97, "text": "video sequence has a resolution of "}, {"category_id": 15, "poly": [1448.0, 1755.0, 1566.0, 1755.0, 1566.0, 1788.0, 1448.0, 1788.0], "score": 0.99, "text": "pixels,a"}, {"category_id": 15, "poly": [889.0, 197.0, 1566.0, 199.0, 1566.0, 238.0, 889.0, 236.0], "score": 1.0, "text": "Once the first iteration of stereo matching is complete,"}, {"category_id": 15, "poly": [864.0, 268.0, 1566.0, 268.0, 1566.0, 300.0, 864.0, 300.0], "score": 0.99, "text": "subsequent iterations. This is done by penalizing disparities"}, {"category_id": 15, "poly": [864.0, 302.0, 1568.0, 302.0, 1568.0, 335.0, 864.0, 335.0], "score": 1.0, "text": "that deviate from their expected values. 
The penalty function"}, {"category_id": 15, "poly": [862.0, 337.0, 996.0, 337.0, 996.0, 369.0, 862.0, 369.0], "score": 0.97, "text": "is given by"}, {"category_id": 15, "poly": [864.0, 236.0, 1094.0, 236.0, 1094.0, 268.0, 864.0, 268.0], "score": 0.96, "text": "disparityestimates"}, {"category_id": 15, "poly": [1135.0, 236.0, 1568.0, 236.0, 1568.0, 268.0, 1135.0, 268.0], "score": 0.97, "text": " can be used to guide matching in"}, {"category_id": 15, "poly": [157.0, 1366.0, 839.0, 1368.0, 838.0, 1407.0, 157.0, 1405.0], "score": 1.0, "text": "Having performed temporal cost aggregation, matches are"}, {"category_id": 15, "poly": [134.0, 1405.0, 834.0, 1405.0, 834.0, 1437.0, 134.0, 1437.0], "score": 0.99, "text": "determined using the Winner-Takes-All (WTA) match selec-"}, {"category_id": 15, "poly": [132.0, 1506.0, 374.0, 1506.0, 374.0, 1538.0, 132.0, 1538.0], "score": 1.0, "text": "cost, and is given by"}, {"category_id": 15, "poly": [692.0, 1439.0, 834.0, 1439.0, 834.0, 1471.0, 692.0, 1471.0], "score": 0.99, "text": ", is the can-"}, {"category_id": 15, "poly": [134.0, 1474.0, 276.0, 1474.0, 276.0, 1506.0, 134.0, 1506.0], "score": 0.98, "text": "didate pixel"}, {"category_id": 15, "poly": [362.0, 1474.0, 836.0, 1474.0, 836.0, 1506.0, 362.0, 1506.0], "score": 0.99, "text": " characterized by the minimum matching"}, {"category_id": 15, "poly": [134.0, 1439.0, 461.0, 1439.0, 461.0, 1471.0, 134.0, 1471.0], "score": 1.0, "text": "tion criteria. The match for"}, {"category_id": 15, "poly": [481.0, 1439.0, 628.0, 1439.0, 628.0, 1471.0, 481.0, 1471.0], "score": 0.96, "text": ", denoted as"}, {"category_id": 15, "poly": [134.0, 548.0, 838.0, 545.0, 838.0, 577.0, 134.0, 580.0], "score": 0.99, "text": "aggregation routine is exectuted. 
At each time instance, the"}, {"category_id": 15, "poly": [134.0, 614.0, 834.0, 614.0, 834.0, 646.0, 134.0, 646.0], "score": 1.0, "text": "weighted summation of costs obtained in the previous frames."}, {"category_id": 15, "poly": [132.0, 646.0, 838.0, 644.0, 838.0, 676.0, 132.0, 678.0], "score": 1.0, "text": "During temporal aggregation, the auxiliary cost is merged with"}, {"category_id": 15, "poly": [132.0, 678.0, 675.0, 681.0, 674.0, 713.0, 132.0, 710.0], "score": 0.99, "text": "the cost obtained from the current frame using"}, {"category_id": 15, "poly": [134.0, 580.0, 549.0, 580.0, 549.0, 612.0, 134.0, 612.0], "score": 1.0, "text": "algorithm stores an auxiliary cost"}, {"category_id": 15, "poly": [649.0, 580.0, 841.0, 580.0, 841.0, 612.0, 649.0, 612.0], "score": 0.96, "text": "which holds a"}, {"category_id": 15, "poly": [157.0, 445.0, 425.0, 442.0, 425.0, 481.0, 157.0, 484.0], "score": 0.98, "text": " Once aggregated costs"}, {"category_id": 15, "poly": [513.0, 445.0, 838.0, 442.0, 838.0, 481.0, 513.0, 484.0], "score": 0.96, "text": " have been computed for all"}, {"category_id": 15, "poly": [132.0, 481.0, 207.0, 481.0, 207.0, 513.0, 132.0, 513.0], "score": 1.0, "text": "pixels"}, {"category_id": 15, "poly": [228.0, 481.0, 838.0, 481.0, 838.0, 513.0, 228.0, 513.0], "score": 0.97, "text": " in the reference image and their respective matching"}, {"category_id": 15, "poly": [134.0, 516.0, 265.0, 516.0, 265.0, 548.0, 134.0, 548.0], "score": 1.0, "text": "candidates"}, {"category_id": 15, "poly": [289.0, 516.0, 838.0, 516.0, 838.0, 548.0, 289.0, 548.0], "score": 0.98, "text": " in the target image, a single-pass temporal"}, {"category_id": 15, "poly": [132.0, 1116.0, 841.0, 1116.0, 841.0, 1155.0, 132.0, 1155.0], "score": 0.99, "text": "in the temporal dimension. 
The temporal adaptive weight has "}, {"category_id": 15, "poly": [134.0, 1153.0, 838.0, 1153.0, 838.0, 1185.0, 134.0, 1185.0], "score": 0.99, "text": "the effect of preserving edges in the temporal domain, such"}, {"category_id": 15, "poly": [132.0, 1182.0, 836.0, 1182.0, 836.0, 1215.0, 132.0, 1215.0], "score": 0.98, "text": "that when a pixel coordinate transitions from one side of an"}, {"category_id": 15, "poly": [134.0, 1219.0, 838.0, 1219.0, 838.0, 1251.0, 134.0, 1251.0], "score": 0.98, "text": "edge to another in subsequent frames, the auxiliary cost is"}, {"category_id": 15, "poly": [134.0, 1254.0, 838.0, 1254.0, 838.0, 1283.0, 134.0, 1283.0], "score": 0.99, "text": "assigned a small weight and the majority of the cost is derived"}, {"category_id": 15, "poly": [130.0, 1283.0, 404.0, 1286.0, 404.0, 1318.0, 129.0, 1315.0], "score": 1.0, "text": "from the current frame."}, {"category_id": 15, "poly": [134.0, 1086.0, 207.0, 1086.0, 207.0, 1118.0, 134.0, 1118.0], "score": 0.99, "text": "where"}, {"category_id": 15, "poly": [237.0, 1086.0, 836.0, 1086.0, 836.0, 1118.0, 237.0, 1118.0], "score": 0.99, "text": "regulates the strength of grouping by color similarity"}, {"category_id": 15, "poly": [864.0, 600.0, 1568.0, 600.0, 1568.0, 632.0, 864.0, 632.0], "score": 1.0, "text": "and the matches are reselected using the WTA match selection"}, {"category_id": 15, "poly": [864.0, 635.0, 1568.0, 635.0, 1568.0, 667.0, 864.0, 667.0], "score": 0.99, "text": "criteria. 
The resulting disparity maps are then post-processed"}, {"category_id": 15, "poly": [864.0, 669.0, 1564.0, 669.0, 1564.0, 699.0, 864.0, 699.0], "score": 0.98, "text": "using a combination of median filtering and occlusion filling."}, {"category_id": 15, "poly": [864.0, 701.0, 1566.0, 701.0, 1566.0, 731.0, 864.0, 731.0], "score": 0.98, "text": "Finally, the current cost becomes the auxiliary cost for the next"}, {"category_id": 15, "poly": [862.0, 731.0, 1340.0, 731.0, 1340.0, 770.0, 862.0, 770.0], "score": 0.99, "text": "pair of frames in the video sequence, i.e.,"}, {"category_id": 15, "poly": [864.0, 768.0, 1017.0, 768.0, 1017.0, 800.0, 864.0, 800.0], "score": 1.0, "text": "for all pixels"}, {"category_id": 15, "poly": [1038.0, 768.0, 1465.0, 768.0, 1465.0, 800.0, 1038.0, 800.0], "score": 0.98, "text": " in the and their matching candidates"}, {"category_id": 15, "poly": [864.0, 502.0, 1427.0, 502.0, 1427.0, 532.0, 864.0, 532.0], "score": 1.0, "text": "values are incorporated into the matching cost as"}, {"category_id": 15, "poly": [864.0, 468.0, 1085.0, 468.0, 1085.0, 500.0, 864.0, 500.0], "score": 0.96, "text": "where the value of"}, {"category_id": 15, "poly": [1108.0, 468.0, 1564.0, 468.0, 1564.0, 500.0, 1108.0, 500.0], "score": 0.99, "text": "is chosen empirically. Next, the penalty"}, {"category_id": 15, "poly": [134.0, 866.0, 838.0, 866.0, 838.0, 898.0, 134.0, 898.0], "score": 0.99, "text": "temporal domain. 
The temporal adaptive weight computed"}, {"category_id": 15, "poly": [132.0, 967.0, 263.0, 967.0, 263.0, 999.0, 132.0, 999.0], "score": 0.93, "text": "is given by"}, {"category_id": 15, "poly": [134.0, 834.0, 320.0, 834.0, 320.0, 866.0, 134.0, 866.0], "score": 0.97, "text": "smoothing and"}, {"category_id": 15, "poly": [444.0, 834.0, 836.0, 834.0, 836.0, 866.0, 444.0, 866.0], "score": 0.92, "text": " enforces color similarity in the"}, {"category_id": 15, "poly": [178.0, 930.0, 838.0, 928.0, 839.0, 967.0, 178.0, 969.0], "score": 0.99, "text": ", located at the same spatial coordinate in the prior frame,"}, {"category_id": 15, "poly": [132.0, 795.0, 490.0, 800.0, 490.0, 832.0, 132.0, 827.0], "score": 0.99, "text": "where the feedback coefficient"}, {"category_id": 15, "poly": [512.0, 795.0, 836.0, 800.0, 836.0, 832.0, 512.0, 827.0], "score": 0.97, "text": " controls the amount of cost"}, {"category_id": 15, "poly": [136.0, 898.0, 465.0, 898.0, 465.0, 930.0, 136.0, 930.0], "score": 0.99, "text": "between the pixel of interest"}, {"category_id": 15, "poly": [486.0, 898.0, 838.0, 898.0, 838.0, 930.0, 486.0, 930.0], "score": 1.0, "text": "in the current frame and pixel"}, {"category_id": 15, "poly": [159.0, 1616.0, 836.0, 1616.0, 836.0, 1648.0, 159.0, 1648.0], "score": 0.99, "text": "To asses the level of confidence associated with selecting"}, {"category_id": 15, "poly": [132.0, 1648.0, 836.0, 1650.0, 836.0, 1682.0, 132.0, 1680.0], "score": 1.0, "text": "minimum cost matches, the algorithm determines another set"}, {"category_id": 15, "poly": [134.0, 1684.0, 838.0, 1684.0, 838.0, 1716.0, 134.0, 1716.0], "score": 1.0, "text": "of matches, this time from the target to reference image, and"}, {"category_id": 15, "poly": [134.0, 1783.0, 182.0, 1783.0, 182.0, 1815.0, 134.0, 1815.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [136.0, 1714.0, 580.0, 1714.0, 580.0, 1746.0, 136.0, 1746.0], "score": 0.98, "text": "verifies if the results agree. 
Given that"}, {"category_id": 15, "poly": [305.0, 1783.0, 592.0, 1783.0, 592.0, 1815.0, 305.0, 1815.0], "score": 0.99, "text": ", the confidence measure"}, {"category_id": 15, "poly": [628.0, 1783.0, 811.0, 1783.0, 811.0, 1815.0, 628.0, 1815.0], "score": 0.97, "text": "is computed as"}, {"category_id": 15, "poly": [132.0, 1746.0, 607.0, 1751.0, 607.0, 1783.0, 132.0, 1778.0], "score": 1.0, "text": "in the right image is the match for pixel"}, {"category_id": 15, "poly": [628.0, 1746.0, 836.0, 1751.0, 836.0, 1783.0, 628.0, 1778.0], "score": 0.98, "text": "in the left image,"}, {"category_id": 15, "poly": [695.0, 1714.0, 815.0, 1714.0, 815.0, 1746.0, 695.0, 1746.0], "score": 0.99, "text": ", i.e. pixel"}, {"category_id": 15, "poly": [1132.0, 814.0, 1298.0, 814.0, 1298.0, 852.0, 1132.0, 852.0], "score": 1.0, "text": "IV. RESULTS"}, {"category_id": 15, "poly": [155.0, 401.0, 481.0, 406.0, 480.0, 445.0, 155.0, 440.0], "score": 0.99, "text": "Temporal cost aggregation"}, {"category_id": 15, "poly": [129.0, 1325.0, 718.0, 1327.0, 718.0, 1366.0, 129.0, 1363.0], "score": 0.99, "text": "C. 
Disparity Selection and Confidence Assessment"}, {"category_id": 15, "poly": [888.0, 158.0, 1252.0, 158.0, 1252.0, 197.0, 888.0, 197.0], "score": 0.97, "text": "Iterative Disparity Refinement"}], "page_info": {"page_no": 2, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [133.2669677734375, 156.7020721435547, 840.6729125976562, 156.7020721435547, 840.6729125976562, 257.75836181640625, 133.2669677734375, 257.75836181640625], "score": 0.9999951124191284}, {"category_id": 3, "poly": [866.177734375, 171.2958526611328, 1510.944580078125, 171.2958526611328, 1510.944580078125, 848.8190307617188, 866.177734375, 848.8190307617188], "score": 0.9999942779541016}, {"category_id": 1, "poly": [131.3756561279297, 1520.5887451171875, 838.545166015625, 1520.5887451171875, 838.545166015625, 1885.353515625, 131.3756561279297, 1885.353515625], "score": 0.9999925494194031}, {"category_id": 4, "poly": [131.56919860839844, 1352.6187744140625, 840.1758422851562, 1352.6187744140625, 840.1758422851562, 1490.513671875, 131.56919860839844, 1490.513671875], "score": 0.9999915361404419}, {"category_id": 1, "poly": [132.41786193847656, 1886.0615234375, 838.675537109375, 1886.0615234375, 838.675537109375, 2019.347412109375, 132.41786193847656, 2019.347412109375], "score": 0.9999526739120483}, {"category_id": 3, "poly": [136.71240234375, 278.259765625, 816.1984252929688, 278.259765625, 816.1984252929688, 1348.5758056640625, 136.71240234375, 1348.5758056640625], "score": 0.9999439120292664}, {"category_id": 1, "poly": [863.4852905273438, 1917.056884765625, 1569.6337890625, 1917.056884765625, 1569.6337890625, 2020.57421875, 863.4852905273438, 2020.57421875], "score": 0.9999344348907471}, {"category_id": 4, "poly": [861.7813720703125, 1749.4459228515625, 1567.659912109375, 1749.4459228515625, 1567.659912109375, 1852.389892578125, 861.7813720703125, 1852.389892578125], "score": 0.9986151456832886}, {"category_id": 3, "poly": [874.6467895507812, 1536.7642822265625, 
1506.6514892578125, 1536.7642822265625, 1506.6514892578125, 1734.9659423828125, 874.6467895507812, 1734.9659423828125], "score": 0.9940656423568726}, {"category_id": 4, "poly": [859.3250122070312, 861.2320556640625, 1569.650634765625, 861.2320556640625, 1569.650634765625, 1033.0804443359375, 859.3250122070312, 1033.0804443359375], "score": 0.985899806022644}, {"category_id": 1, "poly": [861.6172485351562, 1064.186279296875, 1564.036865234375, 1064.186279296875, 1564.036865234375, 1135.5125732421875, 861.6172485351562, 1135.5125732421875], "score": 0.9128350019454956}, {"category_id": 3, "poly": [888.8074340820312, 1163.7965087890625, 1529.8028564453125, 1163.7965087890625, 1529.8028564453125, 1510.91162109375, 888.8074340820312, 1510.91162109375], "score": 0.7896175384521484}, {"category_id": 5, "poly": [900.75146484375, 1161.0631103515625, 1527.15673828125, 1161.0631103515625, 1527.15673828125, 1490.2149658203125, 900.75146484375, 1490.2149658203125], "score": 0.7772396802902222}, {"category_id": 0, "poly": [1178.85791015625, 152.25347900390625, 1284.6339111328125, 152.25347900390625, 1284.6339111328125, 179.1011962890625, 1178.85791015625, 179.1011962890625], "score": 0.5732811689376831}, {"category_id": 4, "poly": [1178.981689453125, 152.21678161621094, 1284.4158935546875, 152.21678161621094, 1284.4158935546875, 179.05447387695312, 1178.981689453125, 179.05447387695312], "score": 0.4503781795501709}, {"category_id": 13, "poly": [1295, 896, 1483, 896, 1483, 931, 1295, 931], "score": 0.93, "latex": "\\{\\pm0,\\pm20,\\pm40\\}"}, {"category_id": 13, "poly": [481, 1919, 534, 1919, 534, 1949, 481, 1949], "score": 0.87, "latex": "\\pm20"}, {"category_id": 13, "poly": [591, 1919, 644, 1919, 644, 1949, 591, 1949], "score": 0.87, "latex": "\\pm40"}, {"category_id": 13, "poly": [1227, 1436, 1253, 1436, 1253, 1459, 1227, 1459], "score": 0.86, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [1295, 1436, 1323, 1436, 1323, 1461, 1295, 1461], "score": 0.85, "latex": 
"\\gamma_{g}"}, {"category_id": 13, "poly": [133, 1588, 186, 1588, 186, 1618, 133, 1618], "score": 0.85, "latex": "\\pm20"}, {"category_id": 13, "poly": [249, 1587, 302, 1587, 302, 1618, 249, 1618], "score": 0.84, "latex": "\\pm40"}, {"category_id": 13, "poly": [787, 1555, 828, 1555, 828, 1585, 787, 1585], "score": 0.82, "latex": "\\pm0"}, {"category_id": 13, "poly": [532, 1421, 572, 1421, 572, 1452, 532, 1452], "score": 0.81, "latex": "3^{\\mathrm{rd}}"}, {"category_id": 13, "poly": [230, 1389, 266, 1389, 266, 1419, 230, 1419], "score": 0.8, "latex": "1^{\\mathrm{st}}"}, {"category_id": 13, "poly": [655, 1986, 675, 1986, 675, 2013, 655, 2013], "score": 0.78, "latex": "\\lambda"}, {"category_id": 13, "poly": [200, 1455, 240, 1455, 240, 1486, 200, 1486], "score": 0.75, "latex": "4^{\\mathrm{th}}"}, {"category_id": 13, "poly": [954, 1255, 980, 1255, 980, 1275, 954, 1275], "score": 0.75, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [954, 1281, 980, 1281, 980, 1302, 954, 1302], "score": 0.74, "latex": "\\gamma_{g}"}, {"category_id": 13, "poly": [959, 1227, 976, 1227, 976, 1245, 959, 1245], "score": 0.74, "latex": "\\tau"}, {"category_id": 13, "poly": [960, 1352, 976, 1352, 976, 1372, 960, 1372], "score": 0.72, "latex": "k"}, {"category_id": 13, "poly": [410, 1986, 430, 1986, 430, 2013, 410, 2013], "score": 0.7, "latex": "\\lambda"}, {"category_id": 13, "poly": [955, 1331, 979, 1331, 979, 1351, 955, 1351], "score": 0.7, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [1489, 1752, 1510, 1752, 1510, 1778, 1489, 1778], "score": 0.69, "latex": "\\lambda"}, {"category_id": 13, "poly": [1176, 965, 1195, 965, 1195, 992, 1176, 992], "score": 0.69, "latex": "\\lambda"}, {"category_id": 13, "poly": [246, 1421, 289, 1421, 289, 1452, 246, 1452], "score": 0.69, "latex": "2^{\\mathrm{nd}}"}, {"category_id": 13, "poly": [958, 1302, 977, 1302, 977, 1323, 958, 1323], "score": 0.63, "latex": "\\lambda"}, {"category_id": 13, "poly": [959, 1380, 977, 1380, 977, 1397, 959, 
1397], "score": 0.58, "latex": "\\alpha"}, {"category_id": 13, "poly": [436, 1621, 455, 1621, 455, 1648, 436, 1648], "score": 0.58, "latex": "\\lambda"}, {"category_id": 13, "poly": [959, 1204, 977, 1204, 977, 1219, 959, 1219], "score": 0.42, "latex": "\\omega"}, {"category_id": 13, "poly": [870, 1592, 890, 1592, 890, 1617, 870, 1617], "score": 0.31, "latex": "\\lambda"}, {"category_id": 15, "poly": [134.0, 160.0, 836.0, 160.0, 836.0, 192.0, 134.0, 192.0], "score": 0.99, "text": "of the synthetic stereo scene from a single camera perspective,"}, {"category_id": 15, "poly": [134.0, 195.0, 838.0, 195.0, 838.0, 227.0, 134.0, 227.0], "score": 0.99, "text": "along with the ground truth disparity, occlusion map, and"}, {"category_id": 15, "poly": [130.0, 222.0, 347.0, 230.0, 346.0, 264.0, 129.0, 256.0], "score": 0.99, "text": "discontinuity map."}, {"category_id": 15, "poly": [155.0, 1517.0, 841.0, 1519.0, 841.0, 1558.0, 155.0, 1556.0], "score": 0.99, "text": " The results of temporal stereo matching are given in Figure"}, {"category_id": 15, "poly": [132.0, 1657.0, 838.0, 1657.0, 838.0, 1689.0, 132.0, 1689.0], "score": 0.99, "text": "stereo matching methods, improvements are negligible when"}, {"category_id": 15, "poly": [132.0, 1691.0, 838.0, 1691.0, 838.0, 1723.0, 132.0, 1723.0], "score": 0.99, "text": "no noise is added to the images [10], [19]. 
This is largely due"}, {"category_id": 15, "poly": [132.0, 1723.0, 836.0, 1723.0, 836.0, 1753.0, 132.0, 1753.0], "score": 0.98, "text": "to the fact that the video used to evaluate these methods is"}, {"category_id": 15, "poly": [129.0, 1753.0, 838.0, 1751.0, 839.0, 1790.0, 129.0, 1792.0], "score": 0.99, "text": " computer generated with very little noise to start with, thus"}, {"category_id": 15, "poly": [134.0, 1790.0, 836.0, 1790.0, 836.0, 1822.0, 134.0, 1822.0], "score": 0.99, "text": "the noise suppression achieved with temporal stereo matching"}, {"category_id": 15, "poly": [132.0, 1817.0, 839.0, 1822.0, 838.0, 1859.0, 132.0, 1854.0], "score": 0.99, "text": "shows little to no improvement over methods that operate on"}, {"category_id": 15, "poly": [130.0, 1856.0, 319.0, 1859.0, 318.0, 1891.0, 129.0, 1888.0], "score": 0.99, "text": "pairs of images."}, {"category_id": 15, "poly": [187.0, 1590.0, 248.0, 1590.0, 248.0, 1622.0, 187.0, 1622.0], "score": 0.87, "text": ",and"}, {"category_id": 15, "poly": [303.0, 1590.0, 838.0, 1590.0, 838.0, 1622.0, 303.0, 1622.0], "score": 0.98, "text": ". Each performance plot is given as a function"}, {"category_id": 15, "poly": [127.0, 1551.0, 786.0, 1554.0, 786.0, 1593.0, 127.0, 1590.0], "score": 0.98, "text": " 3 for uniform additive noise confined to the ranges of"}, {"category_id": 15, "poly": [134.0, 1622.0, 435.0, 1622.0, 435.0, 1655.0, 134.0, 1655.0], "score": 0.99, "text": "of the feedback coefficient"}, {"category_id": 15, "poly": [456.0, 1622.0, 836.0, 1622.0, 836.0, 1655.0, 456.0, 1655.0], "score": 0.97, "text": ". 
As with the majority of temporal"}, {"category_id": 15, "poly": [134.0, 1359.0, 834.0, 1359.0, 834.0, 1391.0, 134.0, 1391.0], "score": 0.99, "text": "Figure 2: Two sample frames from the synthetic video se-"}, {"category_id": 15, "poly": [573.0, 1418.0, 836.0, 1421.0, 836.0, 1460.0, 573.0, 1457.0], "score": 1.0, "text": "row), and discontinuity"}, {"category_id": 15, "poly": [134.0, 1393.0, 229.0, 1393.0, 229.0, 1425.0, 134.0, 1425.0], "score": 0.96, "text": "quence ("}, {"category_id": 15, "poly": [267.0, 1393.0, 836.0, 1393.0, 836.0, 1425.0, 267.0, 1425.0], "score": 0.98, "text": "row), along with their corresponding ground truth"}, {"category_id": 15, "poly": [127.0, 1456.0, 199.0, 1450.0, 199.0, 1489.0, 128.0, 1495.0], "score": 0.91, "text": "map ("}, {"category_id": 15, "poly": [241.0, 1456.0, 309.0, 1450.0, 310.0, 1489.0, 241.0, 1495.0], "score": 1.0, "text": "row)."}, {"category_id": 15, "poly": [129.0, 1418.0, 245.0, 1421.0, 245.0, 1460.0, 129.0, 1457.0], "score": 0.93, "text": " disparity "}, {"category_id": 15, "poly": [290.0, 1418.0, 531.0, 1421.0, 531.0, 1460.0, 290.0, 1457.0], "score": 1.0, "text": "row), occlusion map ("}, {"category_id": 15, "poly": [159.0, 1888.0, 836.0, 1888.0, 836.0, 1920.0, 159.0, 1920.0], "score": 0.99, "text": " Significant improvements in accuracy can be seen in Figure"}, {"category_id": 15, "poly": [132.0, 1950.0, 839.0, 1955.0, 838.0, 1987.0, 132.0, 1982.0], "score": 1.0, "text": "the effect of noise in the current frame is reduced by increasing"}, {"category_id": 15, "poly": [134.0, 1920.0, 480.0, 1920.0, 480.0, 1952.0, 134.0, 1952.0], "score": 0.99, "text": "3 when the noise has ranges of"}, {"category_id": 15, "poly": [535.0, 1920.0, 590.0, 1920.0, 590.0, 1952.0, 535.0, 1952.0], "score": 0.92, "text": " and"}, {"category_id": 15, "poly": [645.0, 1920.0, 836.0, 1920.0, 836.0, 1952.0, 645.0, 1952.0], "score": 0.96, "text": ". 
In this scenario,"}, {"category_id": 15, "poly": [676.0, 1989.0, 838.0, 1989.0, 838.0, 2019.0, 676.0, 2019.0], "score": 0.98, "text": "has the effect"}, {"category_id": 15, "poly": [134.0, 1989.0, 409.0, 1989.0, 409.0, 2019.0, 134.0, 2019.0], "score": 1.0, "text": "the feedback coefficient"}, {"category_id": 15, "poly": [431.0, 1989.0, 654.0, 1989.0, 654.0, 2019.0, 431.0, 2019.0], "score": 0.97, "text": ". This increasing of"}, {"category_id": 15, "poly": [864.0, 1920.0, 1566.0, 1920.0, 1566.0, 1952.0, 864.0, 1952.0], "score": 0.98, "text": "of averaging out noise in the per-pixel costs by selecting"}, {"category_id": 15, "poly": [861.0, 1950.0, 1566.0, 1948.0, 1566.0, 1987.0, 862.0, 1989.0], "score": 0.98, "text": "matches based more heavily upon the auxiliary cost, which"}, {"category_id": 15, "poly": [862.0, 1989.0, 1568.0, 1989.0, 1568.0, 2021.0, 862.0, 2021.0], "score": 0.99, "text": "is essentially a much more stable running average of the cost"}, {"category_id": 15, "poly": [864.0, 1788.0, 1564.0, 1785.0, 1564.0, 1817.0, 864.0, 1820.0], "score": 0.99, "text": "responding to the smallest mean squared error (MSE) of the"}, {"category_id": 15, "poly": [864.0, 1822.0, 1427.0, 1822.0, 1427.0, 1854.0, 864.0, 1854.0], "score": 0.99, "text": "disparity estimates for a range of noise strengths."}, {"category_id": 15, "poly": [862.0, 1748.0, 1488.0, 1753.0, 1488.0, 1785.0, 861.0, 1781.0], "score": 0.99, "text": "Figure 4: Optimal values of the feedback coefficient "}, {"category_id": 15, "poly": [1511.0, 1748.0, 1561.0, 1753.0, 1561.0, 1785.0, 1511.0, 1781.0], "score": 0.96, "text": "cor-"}, {"category_id": 15, "poly": [864.0, 866.0, 1566.0, 866.0, 1566.0, 898.0, 864.0, 898.0], "score": 0.99, "text": "Figure 3: Performance of temporal matching at different levels"}, {"category_id": 15, "poly": [864.0, 935.0, 1566.0, 933.0, 1566.0, 965.0, 864.0, 967.0], "score": 0.98, "text": "squared error (MSE) of disparities is plotted versus the values"}, {"category_id": 15, 
"poly": [864.0, 1001.0, 1492.0, 1001.0, 1492.0, 1031.0, 864.0, 1031.0], "score": 0.99, "text": "values of MSE obtained without temporal aggregation."}, {"category_id": 15, "poly": [864.0, 901.0, 1294.0, 901.0, 1294.0, 933.0, 864.0, 933.0], "score": 0.99, "text": "of uniformly distributed image noise"}, {"category_id": 15, "poly": [1484.0, 901.0, 1568.0, 901.0, 1568.0, 933.0, 1484.0, 933.0], "score": 0.99, "text": ".Mean"}, {"category_id": 15, "poly": [864.0, 967.0, 1175.0, 967.0, 1175.0, 999.0, 864.0, 999.0], "score": 0.99, "text": "of the feedback coefficient"}, {"category_id": 15, "poly": [1196.0, 967.0, 1568.0, 967.0, 1568.0, 999.0, 1196.0, 999.0], "score": 0.99, "text": ". Dashed lines correspond to the"}, {"category_id": 15, "poly": [857.0, 1061.0, 1566.0, 1068.0, 1566.0, 1107.0, 857.0, 1100.0], "score": 0.99, "text": " Table I: Parameters used in the evaluation of real-time tempo-"}, {"category_id": 15, "poly": [859.0, 1102.0, 1093.0, 1105.0, 1092.0, 1137.0, 859.0, 1134.0], "score": 1.0, "text": "ral stereo matching."}, {"category_id": 15, "poly": [1178.0, 151.0, 1282.0, 151.0, 1282.0, 186.0, 1178.0, 186.0], "score": 1.0, "text": "Noise: \u00b10"}, {"category_id": 15, "poly": [1178.0, 151.0, 1282.0, 151.0, 1282.0, 186.0, 1178.0, 186.0], "score": 1.0, "text": "Noise: \u00b10"}], "page_info": {"page_no": 3, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 5, "poly": [880.81298828125, 613.750244140625, 1552.5638427734375, 613.750244140625, 1552.5638427734375, 855.9174194335938, 880.81298828125, 855.9174194335938], "score": 0.9999957084655762}, {"category_id": 1, "poly": [862.7925415039062, 158.05548095703125, 1569.6671142578125, 158.05548095703125, 1569.6671142578125, 456.6153869628906, 862.7925415039062, 456.6153869628906], "score": 0.9999922513961792}, {"category_id": 1, "poly": [864.6585083007812, 1061.7374267578125, 1570.4825439453125, 1061.7374267578125, 1570.4825439453125, 1459.7132568359375, 864.6585083007812, 1459.7132568359375], 
"score": 0.9999921321868896}, {"category_id": 1, "poly": [130.64285278320312, 1519.7022705078125, 836.2221069335938, 1519.7022705078125, 836.2221069335938, 1882.68359375, 130.64285278320312, 1882.68359375], "score": 0.9999898672103882}, {"category_id": 1, "poly": [133.1135711669922, 158.4307861328125, 837.9683837890625, 158.4307861328125, 837.9683837890625, 323.343017578125, 133.1135711669922, 323.343017578125], "score": 0.9999892115592957}, {"category_id": 4, "poly": [132.3511199951172, 1347.8763427734375, 839.7514038085938, 1347.8763427734375, 839.7514038085938, 1476.9757080078125, 132.3511199951172, 1476.9757080078125], "score": 0.9999880790710449}, {"category_id": 7, "poly": [887.6280517578125, 860.9362182617188, 1551.5972900390625, 860.9362182617188, 1551.5972900390625, 964.0142211914062, 887.6280517578125, 964.0142211914062], "score": 0.9999836683273315}, {"category_id": 1, "poly": [869.9986572265625, 1514.7762451171875, 1571.624755859375, 1514.7762451171875, 1571.624755859375, 2022.618896484375, 869.9986572265625, 2022.618896484375], "score": 0.9999811053276062}, {"category_id": 3, "poly": [164.82151794433594, 352.74810791015625, 805.8219604492188, 352.74810791015625, 805.8219604492188, 1320.43310546875, 164.82151794433594, 1320.43310546875], "score": 0.9999799728393555}, {"category_id": 0, "poly": [1137.668701171875, 1477.0120849609375, 1293.498046875, 1477.0120849609375, 1293.498046875, 1502.5439453125, 1137.668701171875, 1502.5439453125], "score": 0.9999679327011108}, {"category_id": 1, "poly": [133.0285186767578, 1886.7501220703125, 837.0147705078125, 1886.7501220703125, 837.0147705078125, 2018.0294189453125, 133.0285186767578, 2018.0294189453125], "score": 0.9999630451202393}, {"category_id": 0, "poly": [1114.8399658203125, 1022.4933471679688, 1317.0313720703125, 1022.4933471679688, 1317.0313720703125, 1052.679931640625, 1114.8399658203125, 1052.679931640625], "score": 0.9999338984489441}, {"category_id": 1, "poly": [862.0576171875, 480.8196105957031, 
1565.8367919921875, 480.8196105957031, 1565.8367919921875, 577.5508422851562, 862.0576171875, 577.5508422851562], "score": 0.8958550691604614}, {"category_id": 6, "poly": [862.0606079101562, 480.7809753417969, 1565.667724609375, 480.7809753417969, 1565.667724609375, 577.4689331054688, 862.0606079101562, 577.4689331054688], "score": 0.4145430028438568}, {"category_id": 13, "poly": [736, 1445, 827, 1445, 827, 1475, 736, 1475], "score": 0.9, "latex": "\\lambda=0.8"}, {"category_id": 13, "poly": [1003, 887, 1105, 887, 1105, 911, 1003, 911], "score": 0.89, "latex": "320\\times240"}, {"category_id": 13, "poly": [338, 1446, 391, 1446, 391, 1475, 338, 1475], "score": 0.87, "latex": "\\pm30"}, {"category_id": 13, "poly": [166, 1619, 219, 1619, 219, 1649, 166, 1649], "score": 0.85, "latex": "\\pm40"}, {"category_id": 13, "poly": [301, 196, 329, 196, 329, 224, 301, 224], "score": 0.84, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [795, 1586, 836, 1586, 836, 1616, 795, 1616], "score": 0.84, "latex": "\\pm0"}, {"category_id": 13, "poly": [1037, 939, 1059, 939, 1059, 960, 1037, 960], "score": 0.83, "latex": "\\%"}, {"category_id": 13, "poly": [462, 1586, 482, 1586, 482, 1613, 462, 1613], "score": 0.78, "latex": "\\lambda"}, {"category_id": 15, "poly": [862.0, 160.0, 1571.0, 160.0, 1571.0, 192.0, 862.0, 192.0], "score": 0.98, "text": "the proposed implementation achieves the highest speed of"}, {"category_id": 15, "poly": [864.0, 195.0, 1566.0, 195.0, 1566.0, 227.0, 864.0, 227.0], "score": 0.99, "text": "operation measured by the number of disparity hypotheses"}, {"category_id": 15, "poly": [864.0, 227.0, 1568.0, 227.0, 1568.0, 259.0, 864.0, 259.0], "score": 0.99, "text": "evaluated per second, as shown in Table I1. 
It is also the second"}, {"category_id": 15, "poly": [862.0, 261.0, 1568.0, 261.0, 1568.0, 293.0, 862.0, 293.0], "score": 0.99, "text": "most accurate real-time method in terms of error rate, as"}, {"category_id": 15, "poly": [864.0, 296.0, 1564.0, 296.0, 1564.0, 325.0, 864.0, 325.0], "score": 1.0, "text": "measured using the Middlebury stereo evaluation benchmark."}, {"category_id": 15, "poly": [859.0, 323.0, 1568.0, 325.0, 1568.0, 358.0, 859.0, 355.0], "score": 0.98, "text": " It should be noted that it is difficult to establish an unbiased"}, {"category_id": 15, "poly": [862.0, 358.0, 1566.0, 358.0, 1566.0, 390.0, 862.0, 390.0], "score": 1.0, "text": "metric for speed comparisons, as the architecture, number of"}, {"category_id": 15, "poly": [866.0, 394.0, 1568.0, 394.0, 1568.0, 426.0, 866.0, 426.0], "score": 0.98, "text": "cores, and clock speed of graphics hardware used are not"}, {"category_id": 15, "poly": [862.0, 424.0, 1259.0, 429.0, 1259.0, 461.0, 861.0, 456.0], "score": 0.99, "text": "consistent across implementations."}, {"category_id": 15, "poly": [889.0, 1061.0, 1571.0, 1061.0, 1571.0, 1100.0, 889.0, 1100.0], "score": 1.0, "text": "While the majority of stereo matching algorithms focus"}, {"category_id": 15, "poly": [859.0, 1093.0, 1571.0, 1095.0, 1571.0, 1134.0, 859.0, 1132.0], "score": 0.99, "text": " on achieving high accuracy on still images, the volume of"}, {"category_id": 15, "poly": [862.0, 1130.0, 1564.0, 1130.0, 1564.0, 1162.0, 862.0, 1162.0], "score": 0.99, "text": "research aimed at recovery of temporally consistent disparity"}, {"category_id": 15, "poly": [862.0, 1162.0, 1568.0, 1162.0, 1568.0, 1201.0, 862.0, 1201.0], "score": 0.99, "text": "maps remains disproportionally small. 
This paper introduces"}, {"category_id": 15, "poly": [862.0, 1196.0, 1568.0, 1196.0, 1568.0, 1235.0, 862.0, 1235.0], "score": 0.98, "text": "an efficient temporal cost aggregation scheme that can easily"}, {"category_id": 15, "poly": [859.0, 1226.0, 1571.0, 1228.0, 1571.0, 1267.0, 859.0, 1265.0], "score": 0.99, "text": "be combined with conventional spatial cost aggregation to"}, {"category_id": 15, "poly": [864.0, 1265.0, 1568.0, 1265.0, 1568.0, 1297.0, 864.0, 1297.0], "score": 1.0, "text": "improve the accuracy of stereo matching when operating on"}, {"category_id": 15, "poly": [864.0, 1297.0, 1568.0, 1297.0, 1568.0, 1329.0, 864.0, 1329.0], "score": 0.99, "text": "video sequences. A synthetic video sequence, along with"}, {"category_id": 15, "poly": [864.0, 1331.0, 1568.0, 1331.0, 1568.0, 1364.0, 864.0, 1364.0], "score": 0.99, "text": "ground truth disparity data, was generated to evaluate the"}, {"category_id": 15, "poly": [862.0, 1361.0, 1571.0, 1361.0, 1571.0, 1400.0, 862.0, 1400.0], "score": 0.98, "text": "performance of the proposed method. It was shown that"}, {"category_id": 15, "poly": [864.0, 1398.0, 1571.0, 1398.0, 1571.0, 1430.0, 864.0, 1430.0], "score": 0.98, "text": "temporal aggregation is significantly more robust to noise than"}, {"category_id": 15, "poly": [862.0, 1430.0, 1497.0, 1430.0, 1497.0, 1462.0, 862.0, 1462.0], "score": 0.99, "text": "a method that only considers the current stereo frames."}, {"category_id": 15, "poly": [157.0, 1517.0, 838.0, 1517.0, 838.0, 1556.0, 157.0, 1556.0], "score": 0.99, "text": "The optimal value of the feedback coefficient is largely"}, {"category_id": 15, "poly": [134.0, 1554.0, 836.0, 1554.0, 836.0, 1584.0, 134.0, 1584.0], "score": 0.97, "text": "dependent on the noise being added to the image. 
Figure 4"}, {"category_id": 15, "poly": [132.0, 1655.0, 838.0, 1655.0, 838.0, 1684.0, 132.0, 1684.0], "score": 0.99, "text": "rely on the auxiliary cost when noise is high and it is more"}, {"category_id": 15, "poly": [132.0, 1684.0, 839.0, 1689.0, 838.0, 1721.0, 132.0, 1716.0], "score": 0.98, "text": "beneficial to rely on the current cost when noise is low. Figure"}, {"category_id": 15, "poly": [132.0, 1719.0, 839.0, 1723.0, 838.0, 1755.0, 132.0, 1751.0], "score": 1.0, "text": "5 illustrates the improvements that are achieved when applying"}, {"category_id": 15, "poly": [134.0, 1755.0, 836.0, 1755.0, 836.0, 1785.0, 134.0, 1785.0], "score": 0.98, "text": "temporal stereo matching to a particular pair of frames in the"}, {"category_id": 15, "poly": [134.0, 1788.0, 834.0, 1788.0, 834.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "synthetic video sequence. Clearly, the noise in the disparity"}, {"category_id": 15, "poly": [134.0, 1822.0, 836.0, 1822.0, 836.0, 1854.0, 134.0, 1854.0], "score": 0.99, "text": "map is drastically reduced when temporal stereo matching is"}, {"category_id": 15, "poly": [132.0, 1856.0, 196.0, 1856.0, 196.0, 1886.0, 132.0, 1886.0], "score": 1.0, "text": "used."}, {"category_id": 15, "poly": [132.0, 1620.0, 165.0, 1620.0, 165.0, 1652.0, 132.0, 1652.0], "score": 0.99, "text": "to"}, {"category_id": 15, "poly": [220.0, 1620.0, 838.0, 1620.0, 838.0, 1652.0, 220.0, 1652.0], "score": 0.98, "text": ". As intuition would suggest, it is more beneficial to"}, {"category_id": 15, "poly": [127.0, 1584.0, 461.0, 1581.0, 461.0, 1620.0, 127.0, 1623.0], "score": 0.96, "text": " shows the optimal values of"}, {"category_id": 15, "poly": [483.0, 1584.0, 794.0, 1581.0, 794.0, 1620.0, 483.0, 1623.0], "score": 0.99, "text": "for noise ranging between"}, {"category_id": 15, "poly": [134.0, 160.0, 836.0, 160.0, 836.0, 192.0, 134.0, 192.0], "score": 0.99, "text": "over the most recent frames. 
By maintaining a reasonably"}, {"category_id": 15, "poly": [134.0, 229.0, 836.0, 229.0, 836.0, 261.0, 134.0, 261.0], "score": 0.98, "text": "edges, essentially reducing over-smoothing of a pixel's dis-"}, {"category_id": 15, "poly": [132.0, 261.0, 838.0, 261.0, 838.0, 293.0, 132.0, 293.0], "score": 0.99, "text": "parity when a pixel transitions from one depth to another in"}, {"category_id": 15, "poly": [130.0, 293.0, 354.0, 296.0, 353.0, 328.0, 129.0, 325.0], "score": 1.0, "text": "subsequent frames."}, {"category_id": 15, "poly": [134.0, 192.0, 300.0, 192.0, 300.0, 225.0, 134.0, 225.0], "score": 0.93, "text": "high value of"}, {"category_id": 15, "poly": [330.0, 192.0, 836.0, 192.0, 836.0, 225.0, 330.0, 225.0], "score": 0.99, "text": ", the auxiliary cost also preserves temporal"}, {"category_id": 15, "poly": [132.0, 1345.0, 836.0, 1348.0, 836.0, 1382.0, 132.0, 1380.0], "score": 1.0, "text": "Figure 5: A comparison of stereo matching without temporal"}, {"category_id": 15, "poly": [132.0, 1382.0, 834.0, 1382.0, 834.0, 1414.0, 132.0, 1414.0], "score": 0.98, "text": "cost aggregation (top\uff09 and with temporal cost aggregation"}, {"category_id": 15, "poly": [134.0, 1416.0, 836.0, 1416.0, 836.0, 1446.0, 134.0, 1446.0], "score": 0.98, "text": "(bottom) for a single frame in the synthetic video sequence"}, {"category_id": 15, "poly": [134.0, 1448.0, 337.0, 1446.0, 337.0, 1478.0, 134.0, 1480.0], "score": 0.98, "text": "where the noise is"}, {"category_id": 15, "poly": [392.0, 1448.0, 735.0, 1446.0, 735.0, 1478.0, 392.0, 1480.0], "score": 0.99, "text": "and the feedback coefficient is"}, {"category_id": 15, "poly": [896.0, 855.0, 1324.0, 857.0, 1323.0, 896.0, 896.0, 894.0], "score": 0.95, "text": "1I Millions of Disparity Estimates per Second."}, {"category_id": 15, "poly": [903.0, 912.0, 1550.0, 912.0, 1550.0, 944.0, 903.0, 944.0], "score": 0.99, "text": "3 As measured by the Middlebury stereo performance benchmark using"}, {"category_id": 15, "poly": [901.0, 887.0, 
1002.0, 887.0, 1002.0, 919.0, 901.0, 919.0], "score": 0.99, "text": "2Assumes"}, {"category_id": 15, "poly": [1106.0, 887.0, 1404.0, 887.0, 1404.0, 919.0, 1106.0, 919.0], "score": 0.98, "text": "images with 32 disparity levels."}, {"category_id": 15, "poly": [915.0, 937.0, 1036.0, 937.0, 1036.0, 969.0, 915.0, 969.0], "score": 0.96, "text": "the avgerage"}, {"category_id": 15, "poly": [1060.0, 937.0, 1192.0, 937.0, 1192.0, 969.0, 1060.0, 969.0], "score": 0.96, "text": "of bad pixels."}, {"category_id": 15, "poly": [873.0, 1515.0, 1571.0, 1515.0, 1571.0, 1545.0, 873.0, 1545.0], "score": 0.97, "text": "[1] D. Scharstein and R. Szeliski, \u201cA taxonomy and evaluation of dense "}, {"category_id": 15, "poly": [915.0, 1542.0, 1573.0, 1542.0, 1573.0, 1572.0, 915.0, 1572.0], "score": 0.98, "text": "two-frame stereo correspondence algorithms\u201d\u2019 International Journal of"}, {"category_id": 15, "poly": [915.0, 1565.0, 1409.0, 1565.0, 1409.0, 1597.0, 915.0, 1597.0], "score": 0.98, "text": "Computer Vision, vol. 47, pp. 7-42, April-June 2002."}, {"category_id": 15, "poly": [871.0, 1588.0, 1568.0, 1590.0, 1568.0, 1623.0, 871.0, 1620.0], "score": 0.98, "text": "[2] D. Scharstein and R. Szeliski, \u201cHigh-accuracy stereo depth maps using"}, {"category_id": 15, "poly": [915.0, 1616.0, 1568.0, 1616.0, 1568.0, 1648.0, 915.0, 1648.0], "score": 0.97, "text": "structured light,\u201d in In IEEE Computer Society Conference on Computer"}, {"category_id": 15, "poly": [915.0, 1641.0, 1508.0, 1641.0, 1508.0, 1673.0, 915.0, 1673.0], "score": 0.98, "text": "Vision and Pattern Recognition, vol. 1, pp. 195-202, June 2003."}, {"category_id": 15, "poly": [873.0, 1666.0, 1568.0, 1666.0, 1568.0, 1696.0, 873.0, 1696.0], "score": 0.99, "text": "[3] J. Kowalczuk, E. Psota, and L. 
Perez, \u201cReal-time stereo matching on"}, {"category_id": 15, "poly": [912.0, 1689.0, 1571.0, 1689.0, 1571.0, 1721.0, 912.0, 1721.0], "score": 0.98, "text": " CUDA using an iterative refinement method for adaptive support-weight"}, {"category_id": 15, "poly": [915.0, 1714.0, 1571.0, 1714.0, 1571.0, 1746.0, 915.0, 1746.0], "score": 0.99, "text": "correspondences,\u201d Circuits and Systems for Video Technology, IEEE"}, {"category_id": 15, "poly": [908.0, 1737.0, 1374.0, 1735.0, 1374.0, 1774.0, 908.0, 1776.0], "score": 0.96, "text": "Transactions on, vol. 23, Ppp. 94 -104, Jan. 2013."}, {"category_id": 15, "poly": [873.0, 1765.0, 1568.0, 1765.0, 1568.0, 1797.0, 873.0, 1797.0], "score": 0.99, "text": "[4] K.-J. Yoon and I.-S. Kweon, Locally adaptive support-weight approach"}, {"category_id": 15, "poly": [912.0, 1790.0, 1571.0, 1790.0, 1571.0, 1822.0, 912.0, 1822.0], "score": 0.97, "text": "for visual correspondence search,' in CVPR'05: Proceedings of the 2005"}, {"category_id": 15, "poly": [915.0, 1815.0, 1571.0, 1815.0, 1571.0, 1847.0, 915.0, 1847.0], "score": 0.96, "text": "IEEE Computer Society Conference on ComputerVision andPattern"}, {"category_id": 15, "poly": [915.0, 1840.0, 1568.0, 1840.0, 1568.0, 1872.0, 915.0, 1872.0], "score": 0.97, "text": "Recognition (CVPR'05) - Volume 2, (Washington, DC, USA), Pp. 924-"}, {"category_id": 15, "poly": [912.0, 1863.0, 1247.0, 1863.0, 1247.0, 1895.0, 912.0, 1895.0], "score": 0.98, "text": "931, IEEE Computer Society, 2005."}, {"category_id": 15, "poly": [873.0, 1891.0, 1568.0, 1891.0, 1568.0, 1923.0, 873.0, 1923.0], "score": 0.97, "text": "[5] L. Wang, M. Liao, M. Gong, R. Yang, and D. 
Nister, \u201cHigh-quality real-"}, {"category_id": 15, "poly": [912.0, 1916.0, 1566.0, 1916.0, 1566.0, 1946.0, 912.0, 1946.0], "score": 0.99, "text": "time stereo using adaptive cost aggregation and dynamic programming,\""}, {"category_id": 15, "poly": [910.0, 1936.0, 1568.0, 1939.0, 1568.0, 1971.0, 910.0, 1969.0], "score": 0.94, "text": "in 3DPVT'06:Proceedings of the Third International Symposium"}, {"category_id": 15, "poly": [915.0, 1964.0, 1568.0, 1964.0, 1568.0, 1996.0, 915.0, 1996.0], "score": 0.98, "text": "on 3D Data Processing, Visualization, and Transmission (3DPVT'06),"}, {"category_id": 15, "poly": [915.0, 1989.0, 1564.0, 1989.0, 1564.0, 2021.0, 915.0, 2021.0], "score": 1.0, "text": "(Washington, DC, USA), Pp. 798-805, IEEE Computer Society, 2006."}, {"category_id": 15, "poly": [1134.0, 1471.0, 1296.0, 1471.0, 1296.0, 1510.0, 1134.0, 1510.0], "score": 1.0, "text": "REFERENCES"}, {"category_id": 15, "poly": [159.0, 1888.0, 836.0, 1888.0, 836.0, 1920.0, 159.0, 1920.0], "score": 0.99, "text": "The algorithm was implement using NVIDIA's Compute"}, {"category_id": 15, "poly": [134.0, 1920.0, 834.0, 1920.0, 834.0, 1950.0, 134.0, 1950.0], "score": 0.98, "text": "Unified Device Architecture (CUDA). The details of the im-"}, {"category_id": 15, "poly": [129.0, 1948.0, 841.0, 1950.0, 841.0, 1989.0, 129.0, 1987.0], "score": 0.98, "text": " plementation are similar to those given in [3]. When compared "}, {"category_id": 15, "poly": [132.0, 1989.0, 836.0, 1989.0, 836.0, 2021.0, 132.0, 2021.0], "score": 0.99, "text": "to other existing real-time stereo matching implementations,"}, {"category_id": 15, "poly": [1111.0, 1022.0, 1317.0, 1022.0, 1317.0, 1061.0, 1111.0, 1061.0], "score": 1.0, "text": "V. 
CONCLUSION"}, {"category_id": 15, "poly": [864.0, 484.0, 1564.0, 484.0, 1564.0, 516.0, 864.0, 516.0], "score": 0.99, "text": "Table II: A comparison of speed and accuracy for the imple-"}, {"category_id": 15, "poly": [864.0, 518.0, 1564.0, 518.0, 1564.0, 550.0, 864.0, 550.0], "score": 0.99, "text": "mentations of many leading real-time stereo matching meth-"}, {"category_id": 15, "poly": [862.0, 550.0, 917.0, 550.0, 917.0, 584.0, 862.0, 584.0], "score": 0.96, "text": "ods."}, {"category_id": 15, "poly": [864.0, 484.0, 1564.0, 484.0, 1564.0, 516.0, 864.0, 516.0], "score": 0.99, "text": "Table II: A comparison of speed and accuracy for the imple-"}, {"category_id": 15, "poly": [864.0, 518.0, 1564.0, 518.0, 1564.0, 550.0, 864.0, 550.0], "score": 0.99, "text": "mentations of many leading real-time stereo matching meth-"}, {"category_id": 15, "poly": [862.0, 550.0, 917.0, 550.0, 917.0, 584.0, 862.0, 584.0], "score": 0.96, "text": "ods."}], "page_info": {"page_no": 4, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [134.58497619628906, 157.681884765625, 841.3460693359375, 157.681884765625, 841.3460693359375, 1666.27001953125, 134.58497619628906, 1666.27001953125], "score": 0.9999936819076538}, {"category_id": 15, "poly": [143.0, 163.0, 838.0, 163.0, 838.0, 192.0, 143.0, 192.0], "score": 0.97, "text": "[6] W. Yu, T. Chen, F. Franchetti, and J. C. Hoe, \u201cHigh performance stereo"}, {"category_id": 15, "poly": [182.0, 188.0, 838.0, 188.0, 838.0, 218.0, 182.0, 218.0], "score": 0.98, "text": "vision designed for massively data parallel platforms,\u2019 Circuits and"}, {"category_id": 15, "poly": [182.0, 213.0, 841.0, 213.0, 841.0, 245.0, 182.0, 245.0], "score": 0.98, "text": "Systems for Video Technology, IEEE Transactions on, vol. 20, pp. 
1509"}, {"category_id": 15, "poly": [182.0, 238.0, 411.0, 238.0, 411.0, 268.0, 182.0, 268.0], "score": 0.98, "text": "-1519, November 2010."}, {"category_id": 15, "poly": [143.0, 264.0, 838.0, 264.0, 838.0, 293.0, 143.0, 293.0], "score": 0.99, "text": "[7] S. Mattoccia, M. Viti, and F. Ries, \u201cNear real-time fast bilateral stereo"}, {"category_id": 15, "poly": [182.0, 289.0, 838.0, 289.0, 838.0, 319.0, 182.0, 319.0], "score": 0.96, "text": "on the GPU in Computer Vision and Pattern Recognition Workshops"}, {"category_id": 15, "poly": [178.0, 307.0, 841.0, 309.0, 841.0, 348.0, 178.0, 346.0], "score": 0.95, "text": "(CVPRW), 2011 IEEE Computer Society Conference on,Ppp. 136 -143,"}, {"category_id": 15, "poly": [185.0, 339.0, 289.0, 339.0, 289.0, 364.0, 185.0, 364.0], "score": 0.98, "text": "June 2011."}, {"category_id": 15, "poly": [141.0, 362.0, 838.0, 362.0, 838.0, 392.0, 141.0, 392.0], "score": 0.98, "text": "[8] K. Zhang, J. Lu, Q. Yang, G. Lafruit, R. Lauwereins, and L. Van Gool,"}, {"category_id": 15, "poly": [182.0, 387.0, 838.0, 387.0, 838.0, 419.0, 182.0, 419.0], "score": 0.98, "text": "\"Real-time and accurate stereo: A scalable approach with bitwise fast"}, {"category_id": 15, "poly": [185.0, 412.0, 838.0, 412.0, 838.0, 445.0, 185.0, 445.0], "score": 0.97, "text": "voting on CUDA,\u201d Circuits and Systems for Video Technology, IEEE"}, {"category_id": 15, "poly": [182.0, 438.0, 656.0, 438.0, 656.0, 468.0, 182.0, 468.0], "score": 0.99, "text": "Transactions on, vol. 21, pp. 867 -878, July 2011."}, {"category_id": 15, "poly": [141.0, 463.0, 838.0, 463.0, 838.0, 493.0, 141.0, 493.0], "score": 0.96, "text": "[9] C. Rhemann, A. Hosni, M. Bleyer, C. Rother, and M. 
Gelautz, \u201cFast cost-"}, {"category_id": 15, "poly": [182.0, 488.0, 838.0, 488.0, 838.0, 518.0, 182.0, 518.0], "score": 0.98, "text": "volume filtering for visual correspondence and beyond,\" in Computer"}, {"category_id": 15, "poly": [180.0, 509.0, 841.0, 511.0, 841.0, 543.0, 180.0, 541.0], "score": 0.95, "text": "Vision and Pattern Recognition (CVPR), 20ll IEEE Conference on,"}, {"category_id": 15, "poly": [180.0, 536.0, 448.0, 534.0, 448.0, 566.0, 180.0, 568.0], "score": 0.99, "text": "Pp. 3017 -3024, June 2011."}, {"category_id": 15, "poly": [134.0, 561.0, 838.0, 561.0, 838.0, 591.0, 134.0, 591.0], "score": 0.99, "text": "[10] A. Hosni, C. Rhemann, M. Bleyer, and M. Gelautz, \u201cTemporally con-"}, {"category_id": 15, "poly": [180.0, 587.0, 836.0, 587.0, 836.0, 616.0, 180.0, 616.0], "score": 0.99, "text": " sistent disparity and optical flow via efficient spatio-temporal filtering,\""}, {"category_id": 15, "poly": [182.0, 612.0, 838.0, 612.0, 838.0, 642.0, 182.0, 642.0], "score": 0.97, "text": "in Advances in Image and Video Technology (Y.-S. Ho, ed.), vol. 7087"}, {"category_id": 15, "poly": [180.0, 632.0, 845.0, 632.0, 845.0, 671.0, 180.0, 671.0], "score": 0.88, "text": "of Lectureotes inComputer Science,pp.16517,Springererlin /"}, {"category_id": 15, "poly": [182.0, 660.0, 353.0, 660.0, 353.0, 692.0, 182.0, 692.0], "score": 1.0, "text": "Heidelberg, 2012."}, {"category_id": 15, "poly": [134.0, 685.0, 838.0, 685.0, 838.0, 717.0, 134.0, 717.0], "score": 0.98, "text": "[11] C. Tomasi and R. Manduchi, \u201cBilateral filtering for gray and color"}, {"category_id": 15, "poly": [182.0, 710.0, 838.0, 710.0, 838.0, 742.0, 182.0, 742.0], "score": 0.98, "text": "images,\u201d in Computer Vision, 1998. Sixth International Conference on,"}, {"category_id": 15, "poly": [180.0, 736.0, 411.0, 731.0, 411.0, 763.0, 181.0, 768.0], "score": 0.93, "text": "pPp. 
839 -846, jan 1998."}, {"category_id": 15, "poly": [132.0, 761.0, 838.0, 761.0, 838.0, 791.0, 132.0, 791.0], "score": 0.97, "text": "[12] K. He, J. Sun, and X. Tang, \u201cGuided image filtering,\u201d\u2019 in Computer"}, {"category_id": 15, "poly": [180.0, 784.0, 838.0, 786.0, 838.0, 818.0, 180.0, 816.0], "score": 0.98, "text": "Vision - ECCV 2010, vol. 6311 of Lecture Notes in Computer Science,"}, {"category_id": 15, "poly": [180.0, 811.0, 607.0, 807.0, 608.0, 839.0, 180.0, 843.0], "score": 0.98, "text": "pp. 1-14, Springer Berlin / Heidelberg, 2010."}, {"category_id": 15, "poly": [129.0, 832.0, 839.0, 837.0, 838.0, 869.0, 129.0, 864.0], "score": 0.98, "text": "[13] L. Zhang, B. Curless, and S. M. Seitz, \u201cSpacetime stereo: Shape"}, {"category_id": 15, "poly": [182.0, 862.0, 836.0, 862.0, 836.0, 891.0, 182.0, 891.0], "score": 0.98, "text": "recovery for dynamic scenes,\u201d in IEEE Computer Society Conference"}, {"category_id": 15, "poly": [182.0, 885.0, 834.0, 885.0, 834.0, 917.0, 182.0, 917.0], "score": 0.97, "text": "on Computer Vision and Pattern Recognition, pp. 367-374, June 2003."}, {"category_id": 15, "poly": [132.0, 910.0, 838.0, 910.0, 838.0, 940.0, 132.0, 940.0], "score": 0.98, "text": "[14] J. Davis, D. Nehab, R. Ramamoorthi, and S. Rusinkiewicz, \u201cSpacetime"}, {"category_id": 15, "poly": [182.0, 935.0, 838.0, 935.0, 838.0, 965.0, 182.0, 965.0], "score": 0.97, "text": "stereo: a unifying framework for depth from triangulation,\u201d\u2019 Pattern"}, {"category_id": 15, "poly": [182.0, 960.0, 838.0, 960.0, 838.0, 990.0, 182.0, 990.0], "score": 0.98, "text": "Analysis and Machine Intelligence, IEEE Transactions on,vol. 27,"}, {"category_id": 15, "poly": [180.0, 983.0, 462.0, 983.0, 462.0, 1015.0, 180.0, 1015.0], "score": 0.97, "text": "Pp. 296 -302, February 2005."}, {"category_id": 15, "poly": [132.0, 1011.0, 838.0, 1011.0, 838.0, 1040.0, 132.0, 1040.0], "score": 0.99, "text": "[15] E. Larsen, P. Mordohai, M. Pollefeys, and H. 
Fuchs, \u201cTemporally"}, {"category_id": 15, "poly": [182.0, 1036.0, 836.0, 1036.0, 836.0, 1066.0, 182.0, 1066.0], "score": 0.99, "text": "consistent reconstruction from multiple video streams using enhanced"}, {"category_id": 15, "poly": [178.0, 1054.0, 843.0, 1056.0, 843.0, 1095.0, 178.0, 1093.0], "score": 0.95, "text": "belief propagation in Computer Vision, 2007.ICCV 2007. IEEE1lth"}, {"category_id": 15, "poly": [180.0, 1082.0, 644.0, 1082.0, 644.0, 1121.0, 180.0, 1121.0], "score": 0.97, "text": "International Conference on, pp. 1 -8, oct. 2007."}, {"category_id": 15, "poly": [134.0, 1109.0, 838.0, 1109.0, 838.0, 1141.0, 134.0, 1141.0], "score": 0.97, "text": "[16] M. Bleyer, M. Gelautz, C. Rother, and C. Rhemann, \u201c\"A stereo approach"}, {"category_id": 15, "poly": [180.0, 1134.0, 838.0, 1134.0, 838.0, 1166.0, 180.0, 1166.0], "score": 0.99, "text": "that handles the mating problem via image warping\" in Computer"}, {"category_id": 15, "poly": [182.0, 1157.0, 838.0, 1157.0, 838.0, 1189.0, 182.0, 1189.0], "score": 0.98, "text": "Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference"}, {"category_id": 15, "poly": [180.0, 1183.0, 459.0, 1175.0, 460.0, 1212.0, 181.0, 1219.0], "score": 0.98, "text": "on, pp. 501 -508, June 2009."}, {"category_id": 15, "poly": [129.0, 1205.0, 838.0, 1208.0, 838.0, 1240.0, 129.0, 1237.0], "score": 0.98, "text": " [17] M. Sizintsev and R. Wildes, \u201cSpatiotemporal stereo via spatiotemporal"}, {"category_id": 15, "poly": [182.0, 1235.0, 838.0, 1235.0, 838.0, 1265.0, 182.0, 1265.0], "score": 0.97, "text": "quadric element (stequel) matching,\u201d in Computer Vision and Pattern"}, {"category_id": 15, "poly": [185.0, 1258.0, 841.0, 1258.0, 841.0, 1290.0, 185.0, 1290.0], "score": 0.98, "text": "Recognition, 2009. CVPR 2009. IEEE Conference on, Pp. 
493 -500,"}, {"category_id": 15, "poly": [185.0, 1286.0, 286.0, 1286.0, 286.0, 1311.0, 185.0, 1311.0], "score": 0.99, "text": "june 2009."}, {"category_id": 15, "poly": [132.0, 1309.0, 838.0, 1309.0, 838.0, 1338.0, 132.0, 1338.0], "score": 0.97, "text": "[18] M. Sizintsev and R. Wildes, \u201cSpatiotemporal stereo and scene flow via"}, {"category_id": 15, "poly": [182.0, 1334.0, 841.0, 1334.0, 841.0, 1364.0, 182.0, 1364.0], "score": 0.97, "text": "stequel matching,\u201d\u2019Pattern Analysis and Machine Intelligence, IEEE"}, {"category_id": 15, "poly": [182.0, 1359.0, 684.0, 1359.0, 684.0, 1391.0, 182.0, 1391.0], "score": 1.0, "text": "Transactions on, vol. 34, pp. 1206 -1219, june 2012."}, {"category_id": 15, "poly": [132.0, 1382.0, 834.0, 1382.0, 834.0, 1412.0, 132.0, 1412.0], "score": 0.98, "text": "[19] C. Richardt, D. Orr, I. Davies, A. Criminisi, and N. A. Dodgson,"}, {"category_id": 15, "poly": [185.0, 1409.0, 838.0, 1409.0, 838.0, 1441.0, 185.0, 1441.0], "score": 0.98, "text": "\"Real-time spatiotemporal stereo matching using the dual-cross-bilateral"}, {"category_id": 15, "poly": [182.0, 1432.0, 838.0, 1432.0, 838.0, 1464.0, 182.0, 1464.0], "score": 0.95, "text": "grid,\" in Proceedings of the European Conference on Computer Vision"}, {"category_id": 15, "poly": [182.0, 1458.0, 838.0, 1458.0, 838.0, 1490.0, 182.0, 1490.0], "score": 0.98, "text": "(ECCV), Lecture Notes in Computer Science, pp. 510-523, September"}, {"category_id": 15, "poly": [182.0, 1477.0, 243.0, 1483.0, 241.0, 1511.0, 179.0, 1505.0], "score": 1.0, "text": "2010."}, {"category_id": 15, "poly": [134.0, 1508.0, 836.0, 1508.0, 836.0, 1538.0, 134.0, 1538.0], "score": 0.98, "text": "[20] S. Paris and F. Durand, \u201cA fast approximation of the bilateral filter using"}, {"category_id": 15, "poly": [182.0, 1533.0, 836.0, 1533.0, 836.0, 1565.0, 182.0, 1565.0], "score": 0.98, "text": "a signal processing approach,\u201d Int. J. Comput. Vision, vol. 81, pp. 
24-52,"}, {"category_id": 15, "poly": [185.0, 1561.0, 282.0, 1561.0, 282.0, 1586.0, 185.0, 1586.0], "score": 0.98, "text": "Jan. 2009."}, {"category_id": 15, "poly": [134.0, 1584.0, 836.0, 1584.0, 836.0, 1613.0, 134.0, 1613.0], "score": 0.98, "text": "[21] Q. Yang, L. Wang, R. Yang, S. Wang, M. Liao, and D. Nist\u00e9r, \u201cReal-"}, {"category_id": 15, "poly": [182.0, 1609.0, 838.0, 1609.0, 838.0, 1641.0, 182.0, 1641.0], "score": 0.98, "text": "time global stereo matching using hierarchical belief propagation.\u201d in"}, {"category_id": 15, "poly": [182.0, 1634.0, 698.0, 1634.0, 698.0, 1666.0, 182.0, 1666.0], "score": 1.0, "text": "British Machine Vision Conference, pp. 989-998, 2006."}], "page_info": {"page_no": 5, "height": 2200, "width": 1700}}] \ No newline at end of file diff --git a/demo/demo2.pdf b/demo/demo2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c8659f212cfb28350c1d71c44bd84fe58df2ca24 --- /dev/null +++ b/demo/demo2.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e94e95637356e1599510436278747d1150a3dfb822233bdc77a9dcb9a4fc6e4 +size 1808096 diff --git a/docs/FAQ_zh_cn.md b/docs/FAQ_zh_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c5f76e1017a77b8fcd08091f2bebe09116ec1154 --- /dev/null +++ b/docs/FAQ_zh_cn.md @@ -0,0 +1,85 @@ +# 常见问题解答 + +### 1.离线部署首次运行,报错urllib.error.URLError: + +首次运行需要在线下载一个小的语言检测模型,如果是离线部署需要手动下载该模型并放到指定目录。 +参考:https://github.com/opendatalab/MinerU/issues/121 + +### 2.在较新版本的mac上使用命令安装pip install magic-pdf[full-cpu] zsh: no matches found: magic-pdf[full-cpu] + +在 macOS 上,默认的 shell 从 Bash 切换到了 Z shell,而 Z shell 对于某些类型的字符串匹配有特殊的处理逻辑,这可能导致no matches found错误。 +可以通过在命令行禁用globbing特性,再尝试运行安装命令 +```bash +setopt no_nomatch +pip install magic-pdf[full-cpu] +``` + +### 3.在intel cpu 的mac上 安装最新版的完整功能包 magic-pdf[full-cpu] (0.6.x) 不成功 + +完整功能包依赖的公式解析库unimernet限制了pytorch的最低版本为2.3.0,而pytorch官方没有为intel cpu的macOS 提供2.3.0版本的预编译包,所以会产生依赖不兼容的问题。 
+可以先尝试安装unimernet的老版本之后再尝试安装完整功能包的其他依赖。(为避免依赖冲突,请激活一个全新的虚拟环境) +```bash +pip install magic-pdf +pip install unimernet==0.1.0 +pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle +``` + +### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败 + +卸载torch和torchvision,重新安装nightly构建版torch和torchvision +```bash +pip uninstall torch torchvision +pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu +``` +参考: https://github.com/opendatalab/PDF-Extract-Kit/issues/23 + +### 5.使用过程中遇到paddle相关的报错FatalError: Illegal instruction is detected by the operating system. + +paddlepaddle 2.6.1与部分linux系统环境存在兼容性问题。 +可尝试降级到2.5.2使用, +```bash +pip install paddlepaddle==2.5.2 +``` +或卸载paddlepaddle,重新安装paddlepaddle-gpu +```bash +pip uninstall paddlepaddle +pip install paddlepaddle-gpu +``` +参考:https://github.com/opendatalab/MinerU/issues/146 + +### 6.使用过程中遇到_pickle.UnpicklingError: invalid load key, 'v'.错误 + +可能是由于模型文件未下载完整导致,可尝试重现下载模型文件后再试 +参考:https://github.com/opendatalab/MinerU/issues/143 + +### 7.程序运行完成后,找不到tmp目录 + +程序输出目录是在"magic-pdf.json"中通过 +```json +{ + "temp-output-dir": "/tmp" +} +``` +进行配置的。 +如果没有更改这个参数,使用默认的配置执行程序,在linux/macOS会在绝对路径"/tmp"下创建一个"magic-pdf"文件夹作为输出路径。 +而在windows下,默认的输出路径与执行命令时,命令行所在的盘符相关,如果命令行在C盘,则默认输出路径为"C://tmp/magic-pdf"。 +参考:https://github.com/opendatalab/MinerU/issues/149 + +### 8.模型文件应该下载到哪里/models-dir的配置应该怎么填 + +模型文件的路径输入是在"magic-pdf.json"中通过 +```json +{ + "models-dir": "/tmp/models" +} +``` +进行配置的。 +这个路径是绝对路径而不是相对路径,绝对路径的获取可在models目录中通过命令 "pwd" 获取。 +参考:https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874 + +### 9.命令行中 --model "model_json_path" 指的是什么? 
+ +model_json 指的是通过模型分析后生成的一种有特定格式的json文件。 +如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。 +如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。 +参考:https://github.com/opendatalab/MinerU/issues/128 \ No newline at end of file diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md new file mode 100644 index 0000000000000000000000000000000000000000..7ae4982418dde1b3544c4161cfec3af6dd183007 --- /dev/null +++ b/docs/how_to_download_models_en.md @@ -0,0 +1,60 @@ +### Install Git LFS +Before you begin, make sure Git Large File Storage (Git LFS) is installed on your system. Install it using the following command: + +```bash +git lfs install +``` + +### Download the Model from Hugging Face +To download the `PDF-Extract-Kit` model from Hugging Face, use the following command: + +```bash +git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit +``` + +Ensure that Git LFS is enabled during the clone to properly download all large files. 
+ + + +### Download the Model from ModelScope + +#### SDK Download + +```bash +# First, install the ModelScope library using pip: +pip install modelscope +``` + +```python +# Use the following Python code to download the model using the ModelScope SDK: +from modelscope import snapshot_download +model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') +``` + +#### Git Download +Alternatively, you can use Git to clone the model repository from ModelScope: + +```bash +git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git +``` + + +Put [model files]() here: + +``` +./ +├── Layout +│ ├── config.json +│ └── weights.pth +├── MFD +│ └── weights.pt +├── MFR +│ └── UniMERNet +│ ├── config.json +│ ├── preprocessor_config.json +│ ├── pytorch_model.bin +│ ├── README.md +│ ├── tokenizer_config.json +│ └── tokenizer.json +└── README.md +``` \ No newline at end of file diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d35a33360594139c50ba4a80747f37bab476b0a8 --- /dev/null +++ b/docs/how_to_download_models_zh_cn.md @@ -0,0 +1,61 @@ +### 安装 Git LFS +开始之前,请确保您的系统上已安装 Git 大文件存储 (Git LFS)。使用以下命令进行安装 + +```bash +git lfs install +``` + +### 从 Hugging Face 下载模型 +请使用以下命令从 Hugging Face 下载 PDF-Extract-Kit 模型: + +```bash +git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit +``` + +确保在克隆过程中启用了 Git LFS,以便正确下载所有大文件。 + + +### 从 ModelScope 下载模型 + +#### SDK下载 + +```bash +# 首先安装modelscope +pip install modelscope +``` + +```python +# 使用modelscope sdk下载模型 +from modelscope import snapshot_download +model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') +``` + +#### Git下载 +也可以使用git clone从 ModelScope 下载模型: + +```bash +git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git +``` + + +将 'models' 目录移动到具有较大磁盘空间的目录中,最好是在固态硬盘(SSD)上。 + + +模型文件夹的结构如下,包含了不同组件的配置文件和权重文件: +``` +./ +├── Layout +│ ├── config.json +│ └── model_final.pth +├── MFD +│ └── weights.pt +├── 
MFR +│ └── UniMERNet +│ ├── config.json +│ ├── preprocessor_config.json +│ ├── pytorch_model.bin +│ ├── README.md +│ ├── tokenizer_config.json +│ └── tokenizer.json +└── README.md +``` diff --git a/docs/images/flowchart_en.png b/docs/images/flowchart_en.png new file mode 100644 index 0000000000000000000000000000000000000000..b490011ea7a9edebeb9edbf3980fc971d89bb76a Binary files /dev/null and b/docs/images/flowchart_en.png differ diff --git a/docs/images/flowchart_zh_cn.png b/docs/images/flowchart_zh_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..32e0a14233972666cf4507af178c77cfe41e3311 Binary files /dev/null and b/docs/images/flowchart_zh_cn.png differ diff --git a/docs/images/project_panorama_en.png b/docs/images/project_panorama_en.png new file mode 100644 index 0000000000000000000000000000000000000000..19616da641076c037c47919c0d3de9efb8e409da Binary files /dev/null and b/docs/images/project_panorama_en.png differ diff --git a/docs/images/project_panorama_zh_cn.png b/docs/images/project_panorama_zh_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..3cd6843e05b827cb0a7c9cc31855de0d3f4645c5 Binary files /dev/null and b/docs/images/project_panorama_zh_cn.png differ diff --git a/magic-pdf.template.json b/magic-pdf.template.json new file mode 100644 index 0000000000000000000000000000000000000000..2c0223db0deb5e54543b92e073bd598e60047d73 --- /dev/null +++ b/magic-pdf.template.json @@ -0,0 +1,9 @@ +{ + "bucket_info":{ + "bucket-name-1":["ak", "sk", "endpoint"], + "bucket-name-2":["ak", "sk", "endpoint"] + }, + "temp-output-dir":"/tmp", + "models-dir":"/tmp/models", + "device-mode":"cpu" +} \ No newline at end of file diff --git a/magic_pdf/__init__.py b/magic_pdf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/cli/__init__.py b/magic_pdf/cli/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/cli/magicpdf.py b/magic_pdf/cli/magicpdf.py new file mode 100644 index 0000000000000000000000000000000000000000..5ff6bc8b50edf47e82e03d8f5d4b4b7c9311767f --- /dev/null +++ b/magic_pdf/cli/magicpdf.py @@ -0,0 +1,359 @@ +""" +这里实现2个click命令: +第一个: + 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350 + 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。 + 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf + 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图 + 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 + + 最后把以上步骤准备好的对象传入真正的解析API + +第二个: + 接收1)pdf的本地路径。2)模型json文件(可选)。然后: + 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图 + 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 + 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入 + + +效果: +python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 +python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf +""" + +import os +import json as json_parse +import click +from loguru import logger +from pathlib import Path +from magic_pdf.libs.version import __version__ + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.pipe.OCRPipe import OCRPipe +from magic_pdf.pipe.TXTPipe import TXTPipe +from magic_pdf.libs.path_utils import ( + parse_s3path, + parse_s3_range_params, + remove_non_official_s3_args, +) +from magic_pdf.libs.config_reader import ( + get_local_dir, + get_s3_config, +) +from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter 
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +import csv +import copy +import magic_pdf.model as model_config + +parse_pdf_methods = click.Choice(["ocr", "txt", "auto"]) + + +def prepare_env(pdf_file_name, method): + local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method) + + local_image_dir = os.path.join(str(local_parent_dir), "images") + local_md_dir = local_parent_dir + os.makedirs(local_image_dir, exist_ok=True) + os.makedirs(local_md_dir, exist_ok=True) + return local_image_dir, local_md_dir + + +def write_to_csv(csv_file_path, csv_data): + with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile: + # 创建csv writer对象 + csv_writer = csv.writer(csvfile) + # 写入数据 + csv_writer.writerow(csv_data) + logger.info(f"数据已成功追加到 '{csv_file_path}'") + + +def do_parse( + pdf_file_name, + pdf_bytes, + model_list, + parse_method, + f_draw_span_bbox=True, + f_draw_layout_bbox=True, + f_dump_md=True, + f_dump_middle_json=True, + f_dump_model_json=True, + f_dump_orig_pdf=True, + f_dump_content_list=True, + f_make_md_mode=MakeMode.MM_MD, +): + + orig_model_list = copy.deepcopy(model_list) + + local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method) + logger.info(f"local output dir is {local_md_dir}") + image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir) + image_dir = str(os.path.basename(local_image_dir)) + + if parse_method == "auto": + jso_useful_key = {"_pdf_type": "", "model_list": model_list} + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True) + elif parse_method == "txt": + pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True) + elif parse_method == "ocr": + pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True) + else: + logger.error("unknown parse method") + exit(1) + + pipe.pipe_classify() + + """如果没有传入有效的模型数据,则使用内置model解析""" + if len(model_list) == 0: + if model_config.__use_inside_model__: + 
pipe.pipe_analyze() + orig_model_list = copy.deepcopy(pipe.model_list) + else: + logger.error("need model list input") + exit(1) + + pipe.pipe_parse() + pdf_info = pipe.pdf_mid_data["pdf_info"] + if f_draw_layout_bbox: + draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) + if f_draw_span_bbox: + draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) + + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode) + if f_dump_md: + """写markdown""" + md_writer.write( + content=md_content, + path=f"{pdf_file_name}.md", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_middle_json: + """写middle_json""" + md_writer.write( + content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_middle.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_model_json: + """写model_json""" + md_writer.write( + content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_model.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_orig_pdf: + """写源pdf""" + md_writer.write( + content=pdf_bytes, + path=f"{pdf_file_name}_origin.pdf", + mode=AbsReaderWriter.MODE_BIN, + ) + + content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE) + if f_dump_content_list: + """写content_list""" + md_writer.write( + content=json_parse.dumps(content_list, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_content_list.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + +@click.group() +@click.version_option(__version__, "--version", "-v", help="显示版本信息") +@click.help_option("--help", "-h", help="显示帮助信息") +def cli(): + pass + + +@cli.command() +@click.option("--json", type=str, help="输入一个S3路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, 
default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def json_command(json, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + if not json.startswith("s3://"): + logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path") + exit(1) + + def read_s3_path(s3path): + bucket, key = parse_s3path(s3path) + + s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket) + s3_rw = S3ReaderWriter( + s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path) + ) + may_range_params = parse_s3_range_params(s3path) + if may_range_params is None or 2 != len(may_range_params): + byte_start, byte_end = 0, None + else: + byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1]) + byte_end += byte_start - 1 + return s3_rw.read_jsonl( + remove_non_official_s3_args(s3path), + byte_start, + byte_end, + AbsReaderWriter.MODE_BIN, + ) + + jso = json_parse.loads(read_s3_path(json).decode("utf-8")) + s3_file_path = jso.get("file_location") + if s3_file_path is None: + s3_file_path = jso.get("path") + pdf_file_name = Path(s3_file_path).stem + pdf_data = read_s3_path(s3_file_path) + + do_parse( + pdf_file_name, + pdf_data, + jso["doc_layout_result"], + method, + ) + + +@cli.command() +@click.option("--local_json", type=str, help="输入一个本地jsonl路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def local_json_command(local_json, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + def read_s3_path(s3path): + bucket, key = parse_s3path(s3path) + + s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket) 
+ s3_rw = S3ReaderWriter( + s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path) + ) + may_range_params = parse_s3_range_params(s3path) + if may_range_params is None or 2 != len(may_range_params): + byte_start, byte_end = 0, None + else: + byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1]) + byte_end += byte_start - 1 + return s3_rw.read_jsonl( + remove_non_official_s3_args(s3path), + byte_start, + byte_end, + AbsReaderWriter.MODE_BIN, + ) + + with open(local_json, "r", encoding="utf-8") as f: + for json_line in f: + jso = json_parse.loads(json_line) + + s3_file_path = jso.get("file_location") + if s3_file_path is None: + s3_file_path = jso.get("path") + pdf_file_name = Path(s3_file_path).stem + pdf_data = read_s3_path(s3_file_path) + do_parse( + pdf_file_name, + pdf_data, + jso["doc_layout_result"], + method, + ) + + +@cli.command() +@click.option( + "--pdf", type=click.Path(exists=True), required=True, + help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径') +@click.option("--model", type=click.Path(exists=True), help="模型的路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def pdf_command(pdf, model, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + def read_fn(path): + disk_rw = DiskReaderWriter(os.path.dirname(path)) + return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN) + + def get_model_json(model_path, doc_path): + # 这里处理pdf和模型相关的逻辑 + if model_path is None: + file_name_without_extension, extension = os.path.splitext(doc_path) + if extension == ".pdf": + model_path = file_name_without_extension + ".json" + else: + raise 
Exception("pdf_path input error") + if not os.path.exists(model_path): + logger.warning( + f"not found json {model_path} existed" + ) + # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle + model_json = "[]" + else: + model_json = read_fn(model_path).decode("utf-8") + else: + model_json = read_fn(model_path).decode("utf-8") + + return model_json + + def parse_doc(doc_path): + try: + file_name = str(Path(doc_path).stem) + pdf_data = read_fn(doc_path) + jso = json_parse.loads(get_model_json(model, doc_path)) + + do_parse( + file_name, + pdf_data, + jso, + method, + ) + + except Exception as e: + logger.exception(e) + + if not pdf: + logger.error(f"Error: Missing argument '--pdf'.") + exit(f"Error: Missing argument '--pdf'.") + else: + '''适配多个文档的list文件输入''' + if pdf.endswith(".list"): + with open(pdf, "r") as f: + for line in f.readlines(): + line = line.strip() + parse_doc(line) + else: + '''适配单个文档的输入''' + parse_doc(pdf) + + +if __name__ == "__main__": + """ + python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551 + """ + cli() diff --git a/magic_pdf/dict2md/__init__.py b/magic_pdf/dict2md/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/dict2md/mkcontent.py b/magic_pdf/dict2md/mkcontent.py new file mode 100644 index 0000000000000000000000000000000000000000..049e290e5433f4f19c8a43c7ddb634dcab149ffc --- /dev/null +++ b/magic_pdf/dict2md/mkcontent.py @@ -0,0 +1,397 @@ +import math +from loguru import logger + +from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.ocr_content_type import ContentType + +TYPE_INLINE_EQUATION = ContentType.InlineEquation +TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation +UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + + 
+@DeprecationWarning +def mk_nlp_markdown_1(para_dict: dict): + """ + 对排序后的bboxes拼接内容 + """ + content_lst = [] + for _, page_info in para_dict.items(): + para_blocks = page_info.get("para_blocks") + if not para_blocks: + continue + + for block in para_blocks: + item = block["paras"] + for _, p in item.items(): + para_text = p["para_text"] + is_title = p["is_para_title"] + title_level = p['para_title_level'] + md_title_prefix = "#"*title_level + if is_title: + content_lst.append(f"{md_title_prefix} {para_text}") + else: + content_lst.append(para_text) + + content_text = "\n\n".join(content_lst) + + return content_text + + + +# 找到目标字符串在段落中的索引 +def __find_index(paragraph, target): + index = paragraph.find(target) + if index != -1: + return index + else: + return None + + +def __insert_string(paragraph, target, postion): + new_paragraph = paragraph[:postion] + target + paragraph[postion:] + return new_paragraph + + +def __insert_after(content, image_content, target): + """ + 在content中找到target,将image_content插入到target后面 + """ + index = content.find(target) + if index != -1: + content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):] + else: + logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}") + return content + +def __insert_before(content, image_content, target): + """ + 在content中找到target,将image_content插入到target前面 + """ + index = content.find(target) + if index != -1: + content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:] + else: + logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}") + return content + + +@DeprecationWarning +def mk_mm_markdown_1(para_dict: dict): + """拼装多模态markdown""" + content_lst = [] + for _, page_info in para_dict.items(): + page_lst = [] # 一个page内的段落列表 + para_blocks = page_info.get("para_blocks") + pymu_raw_blocks = page_info.get("preproc_blocks") + + 
all_page_images = [] + all_page_images.extend(page_info.get("images",[])) + all_page_images.extend(page_info.get("image_backup", []) ) + all_page_images.extend(page_info.get("tables",[])) + all_page_images.extend(page_info.get("table_backup",[]) ) + + if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景 + for img in all_page_images: + page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序 + page_md = "\n\n".join(page_lst) + + else: + for block in para_blocks: + item = block["paras"] + for _, p in item.items(): + para_text = p["para_text"] + is_title = p["is_para_title"] + title_level = p['para_title_level'] + md_title_prefix = "#"*title_level + if is_title: + page_lst.append(f"{md_title_prefix} {para_text}") + else: + page_lst.append(para_text) + + """拼装成一个页面的文本""" + page_md = "\n\n".join(page_lst) + """插入图片""" + for img in all_page_images: + imgbox = img['bbox'] + img_content = f"![]({img['image_path']})" + # 先看在哪个block内 + for block in pymu_raw_blocks: + bbox = block['bbox'] + if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内 + for l in block['lines']: + line_box = l['bbox'] + if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面 + line_txt = "".join([s['text'] for s in l['spans']]) + page_md = __insert_before(page_md, img_content, line_txt) + break + break + else:# 在行与行之间 + # 找到图片x0,y0与line的x0,y0最近的line + min_distance = 100000 + min_line = None + for l in block['lines']: + line_box = l['bbox'] + distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2) + if distance < min_distance: + min_distance = distance + min_line = l + if min_line: + line_txt = "".join([s['text'] for s in min_line['spans']]) + img_h = imgbox[3] - imgbox[1] + if min_distance 15: + words[j] = ' '.join(wordninja.split(words[j])) + segments[i] = ''.join(words) + return ' '.join(segments) + + +def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path): + 
markdown = [] + for page_info in pdf_info_list: + paras_of_layout = page_info.get("para_blocks") + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + markdown.extend(page_markdown) + return '\n\n'.join(markdown) + + +def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list): + markdown = [] + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp") + markdown.extend(page_markdown) + return '\n\n'.join(markdown) + + +def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path): + markdown_with_para_and_pagination = [] + page_no = 0 + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + markdown_with_para_and_pagination.append({ + 'page_no': page_no, + 'md_content': '\n\n'.join(page_markdown) + }) + page_no += 1 + return markdown_with_para_and_pagination + + +def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""): + page_markdown = [] + for paras in paras_of_layout: + for para in paras: + para_text = '' + for line in para: + for span in line['spans']: + span_type = span.get('type') + content = '' + language = '' + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + elif span_type == ContentType.InterlineEquation: + content = f"\n$$\n{span['content']}\n$$\n" + elif span_type in [ContentType.Image, ContentType.Table]: + if mode == 'mm': + content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" + elif mode == 'nlp': 
+ pass + if content != '': + if language == 'en': # 英文语境下 content间需要空格分隔 + para_text += content + ' ' + else: # 中文语境下,content间不需要空格分隔 + para_text += content + if para_text.strip() == '': + continue + else: + page_markdown.append(para_text.strip() + ' ') + return page_markdown + + +def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): + page_markdown = [] + for para_block in paras_of_layout: + para_text = '' + para_type = para_block['type'] + if para_type == BlockType.Text: + para_text = merge_para_with_text(para_block) + elif para_type == BlockType.Title: + para_text = f"# {merge_para_with_text(para_block)}" + elif para_type == BlockType.InterlineEquation: + para_text = merge_para_with_text(para_block) + elif para_type == BlockType.Image: + if mode == 'nlp': + continue + elif mode == 'mm': + for block in para_block['blocks']: # 1st.拼image_body + if block['type'] == BlockType.ImageBody: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.Image: + para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" + for block in para_block['blocks']: # 2nd.拼image_caption + if block['type'] == BlockType.ImageCaption: + para_text += merge_para_with_text(block) + elif para_type == BlockType.Table: + if mode == 'nlp': + continue + elif mode == 'mm': + for block in para_block['blocks']: # 1st.拼table_caption + if block['type'] == BlockType.TableCaption: + para_text += merge_para_with_text(block) + for block in para_block['blocks']: # 2nd.拼table_body + if block['type'] == BlockType.TableBody: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.Table: + para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" + for block in para_block['blocks']: # 3rd.拼table_footnote + if block['type'] == BlockType.TableFootnote: + para_text += merge_para_with_text(block) + + if para_text.strip() == '': + continue + else: + page_markdown.append(para_text.strip() + 
' ') + + return page_markdown + + +def merge_para_with_text(para_block): + para_text = '' + for line in para_block['lines']: + line_text = "" + line_lang = "" + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + if line_text != "": + line_lang = detect_lang(line_text) + for span in line['spans']: + span_type = span['type'] + content = '' + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + elif span_type == ContentType.InterlineEquation: + content = f"\n$$\n{span['content']}\n$$\n" + + if content != '': + if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断 + para_text += content # 中文语境下,content间不需要空格分隔 + else: + para_text += content + ' ' # 英文语境下 content间需要空格分隔 + return para_text + + +def para_to_standard_format(para, img_buket_path): + para_content = {} + if len(para) == 1: + para_content = line_to_standard_format(para[0], img_buket_path) + elif len(para) > 1: + para_text = '' + inline_equation_num = 0 + for line in para: + for span in line['spans']: + language = '' + span_type = span.get('type') + content = "" + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + inline_equation_num += 1 + + if language == 'en': # 英文语境下 content间需要空格分隔 + para_text += content + ' ' + else: # 中文语境下,content间不需要空格分隔 + para_text += content + para_content = { + 'type': 'text', + 'text': 
para_text, + 'inline_equation_num': inline_equation_num + } + return para_content + + +def para_to_standard_format_v2(para_block, img_buket_path): + para_type = para_block['type'] + if para_type == BlockType.Text: + para_content = { + 'type': 'text', + 'text': merge_para_with_text(para_block), + } + elif para_type == BlockType.Title: + para_content = { + 'type': 'text', + 'text': merge_para_with_text(para_block), + 'text_level': 1 + } + elif para_type == BlockType.InterlineEquation: + para_content = { + 'type': 'equation', + 'text': merge_para_with_text(para_block), + 'text_format': "latex" + } + elif para_type == BlockType.Image: + para_content = { + 'type': 'image', + } + for block in para_block['blocks']: + if block['type'] == BlockType.ImageBody: + para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) + if block['type'] == BlockType.ImageCaption: + para_content['img_caption'] = merge_para_with_text(block) + elif para_type == BlockType.Table: + para_content = { + 'type': 'table', + } + for block in para_block['blocks']: + if block['type'] == BlockType.TableBody: + para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) + if block['type'] == BlockType.TableCaption: + para_content['table_caption'] = merge_para_with_text(block) + if block['type'] == BlockType.TableFootnote: + para_content['table_footnote'] = merge_para_with_text(block) + + return para_content + + +def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str): + content_list = [] + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + for para_block in paras_of_layout: + para_content = para_to_standard_format_v2(para_block, img_buket_path) + content_list.append(para_content) + return content_list + + +def line_to_standard_format(line, img_buket_path): + line_text = "" + inline_equation_num = 0 + for span in line['spans']: + if not 
span.get('content'): + if not span.get('image_path'): + continue + else: + if span['type'] == ContentType.Image: + content = { + 'type': 'image', + 'img_path': join_path(img_buket_path, span['image_path']) + } + return content + elif span['type'] == ContentType.Table: + content = { + 'type': 'table', + 'img_path': join_path(img_buket_path, span['image_path']) + } + return content + else: + if span['type'] == ContentType.InterlineEquation: + interline_equation = span['content'] + content = { + 'type': 'equation', + 'latex': f"$$\n{interline_equation}\n$$" + } + return content + elif span['type'] == ContentType.InlineEquation: + inline_equation = span['content'] + line_text += f"${inline_equation}$" + inline_equation_num += 1 + elif span['type'] == ContentType.Text: + text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号 + line_text += text_content + content = { + 'type': 'text', + 'text': line_text, + 'inline_equation_num': inline_equation_num + } + return content + + +def ocr_mk_mm_standard_format(pdf_info_dict: list): + """ + content_list + type string image/text/table/equation(行间的单独拿出来,行内的和text合并) + latex string latex文本字段。 + text string 纯文本格式的文本数据。 + md string markdown格式的文本数据。 + img_path string s3://full/path/to/img.jpg + """ + content_list = [] + for page_info in pdf_info_dict: + blocks = page_info.get("preproc_blocks") + if not blocks: + continue + for block in blocks: + for line in block['lines']: + content = line_to_standard_format(line) + content_list.append(content) + return content_list + + +def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""): + output_content = [] + for page_info in pdf_info_dict: + if page_info.get("need_drop", False): + drop_reason = page_info.get("drop_reason") + if drop_mode == DropMode.NONE: + pass + elif drop_mode == DropMode.WHOLE_PDF: + raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}") + elif drop_mode == DropMode.SINGLE_PAGE: + 
logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}") + continue + else: + raise Exception(f"drop_mode can not be null") + + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + if make_mode == MakeMode.MM_MD: + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + output_content.extend(page_markdown) + elif make_mode == MakeMode.NLP_MD: + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp") + output_content.extend(page_markdown) + elif make_mode == MakeMode.STANDARD_FORMAT: + for para_block in paras_of_layout: + para_content = para_to_standard_format_v2(para_block, img_buket_path) + output_content.append(para_content) + if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: + return '\n\n'.join(output_content) + elif make_mode == MakeMode.STANDARD_FORMAT: + return output_content diff --git a/magic_pdf/filter/__init__.py b/magic_pdf/filter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py new file mode 100644 index 0000000000000000000000000000000000000000..319fb3fb20273187507fa66a10cdc0d61e99d500 --- /dev/null +++ b/magic_pdf/filter/pdf_classify_by_type.py @@ -0,0 +1,393 @@ +""" +根据利用meta_scan得到的结果,对pdf是否为文字版进行分类。 +定义标准: +一、什么pdf会是文字pdf,只要满足以下任意一条 + 1. 随机抽取N页,如果有任何一页文字数目大于100 + 2. 只要存在一个页面,图片的数量为0 +二、什么是扫描版pdf,只要满足以下任意一条 + 1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~ + 2. 
大部分页面上文字的长度都是相等的。 + +""" +import json +import sys +from collections import Counter + +import click +import numpy as np +from loguru import logger + +from magic_pdf.libs.commons import mymax, get_top_percent_list +from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min + +TEXT_LEN_THRESHOLD = 100 +AVG_TEXT_LEN_THRESHOLD = 100 +TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计 + + +# 一个拼接图片的方案,将某些特殊扫描版本的拆图拼成一张整图 +def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2): + # 先通过set去除所有bbox重叠的图片数据 + image_list_result = [] + for page_images in image_list: + page_result = [] + dedup = set() + for img in page_images: + x0, y0, x1, y1, img_bojid = img + if (x0, y0, x1, y1) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 + continue + else: + dedup.add((x0, y0, x1, y1)) + page_result.append([x0, y0, x1, y1, img_bojid]) + image_list_result.append(page_result) + + # 接下来,将同一页可拼接的图片进行合并 + merged_images = [] + for page_images in image_list_result: + if not page_images: + continue + + # 先将同一页的图片从上到下,从左到右进行排序 + page_images.sort(key=lambda img: (img[1], img[0])) + + merged = [page_images[0]] + + for img in page_images[1:]: + x0, y0, x1, y1, imgid = img + + last_img = merged[-1] + last_x0, last_y0, last_x1, last_y1, last_imgid = last_img + + # 单张图片宽或者高覆盖页面宽高的9成以上是拼图的一个前置条件 + full_width = abs(x1 - x0) >= page_width * 0.9 + full_height = abs(y1 - y0) >= page_height * 0.9 + + # 如果宽达标,检测是否能竖着拼 + if full_width: + # 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap + close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= ( + last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap) + + # 如果高达标,检测是否可以横着拼 + if full_height: + # 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap + close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= ( + last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap) + + # Check if 
the image can be merged with the last image + if (full_width and close1) or (full_height and close2): + # Merge the image with the last image + merged[-1] = [min(x0, last_x0), min(y0, last_y0), + max(x1, last_x1), max(y1, last_y1), imgid] + else: + # Add the image as a new image + merged.append(img) + + merged_images.append(merged) + + return merged_images + + +def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list): + """ + 80%页面上的最大图大小一样并且面积超过页面面积0.6则返回False,否则返回True + :param pdf_path: + :param total_page: + :param page_width: + :param page_height: + :param img_sz_list: + :return: + """ + # # 只要有一页没有图片,那么就是文字pdf。但是同时还需要满足一个条件就是这个页面上同时不能有文字。发现过一些扫描版pdf,上面有一些空白页面,既没有图片也没有文字。 + # if any([len(img_sz) == 0 for img_sz in img_sz_list]): # 含有不含图片的页面 + # # 现在找到这些页面的index + # empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0] + # # 然后检查这些页面上是否有文字 + # text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0] + # if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD: # 没有图片,但是有文字,说明可能是个文字版,如果没有文字则无法判断,留给下一步,现在要求这页文字量超过一定阈值 + # return True + + # 通过objid去掉重复出现10次以上的图片,这些图片是隐藏的透明图层,其特点是id都一样 + # 先对每个id出现的次数做个统计 + objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz]) + # 再去掉出现次数大于10的 + if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page + total_page = scan_max_page + + repeat_threshold = 2 # 把bad_image的阈值设为2 + # repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img + bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold]) + # bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])] + # text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in 
bad_image_page_idx and text_len > 0] + + # 特殊情况,一个文字版pdf,每页覆盖一个超大的透明图片,超大的定义是图片占整页面积的90%以上 + # fake_image_ids = [objid for objid in bad_image_objid if + # any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for + # x0, y0, x1, y1, _ in images])] # 原来的代码,any里面恒为true了,原因??? + # fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images + # if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9] + + # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值 + # return True + + img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in + img_sz_list] # 过滤掉重复出现的图片 + + # 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算 + img_sz_list = merge_images(img_sz_list, page_width, page_height) + + # 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例 + max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in + img_sz_list] + page_area = page_width * page_height + max_image_area_per_page = [area / page_area for area in max_image_area_per_page] + max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5] + + if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况 + # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样 + return False + else: + return True + + +def classify_by_text_len(text_len_list: list, total_page: int): + """ + 随机抽取10%的页面,如果少于5个页面,那么就取全部页面。 + 查看页面上的文字长度,如果有任何一个页面的文字长度大于TEXT_LEN_THRESHOLD,那么就是文字pdf + :param total_page: + :param text_len_list: + :return: + """ + select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO) # 选取10%的页面 + if select_page_cnt < 5: + select_page_cnt = total_page + + # # 排除头尾各10页 + # if total_page > 20: # 如果总页数大于20 + # page_range = list(range(10, total_page - 10)) # 从第11页到倒数第11页 + # else: + # page_range = list(range(total_page)) # 否则选择所有页面 + 
# page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False) + # 排除前后10页对只有21,22页的pdf很尴尬,如果选出来的中间那一两页恰好没字容易误判,有了avg_words规则,这个规则可以忽略 + page_num = np.random.choice(total_page, select_page_cnt, replace=False) + text_len_lst = [text_len_list[i] for i in page_num] + is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst]) + return is_text_pdf + + +def classify_by_avg_words(text_len_list: list): + """ + 补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf + 主要是各种图集 + :param text_len_list: + :return: + """ + sum_words = sum(text_len_list) + count_of_numbers = len(text_len_list) + if count_of_numbers == 0: + is_text_pdf = False + else: + avg_words = round(sum_words / count_of_numbers) + if avg_words > AVG_TEXT_LEN_THRESHOLD: + is_text_pdf = True + else: + is_text_pdf = False + + return is_text_pdf + + +def classify_by_img_num(img_sz_list: list, img_num_list: list): + """ + 补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重, + 这种pdf的 metasca 扫描结果的特点是 img_sz_list 内全是空元素,img_num_list中每一页的数量都很大且相同 + :param img_sz_list: + :param img_num_list: + :return: + """ + # 计算img_sz_list中非空元素的个数 + count_img_sz_list_not_none = sum(1 for item in img_sz_list if item) + # 获取前80%的元素 + top_eighty_percent = get_top_percent_list(img_num_list, 0.8) + # img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min + if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min: + + #拿max和min的值,用来判断list内的值是否全都相等 + # min_imgs = min(img_num_list) + # max_imgs = max(img_num_list) + # + # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min: + return False # 如果满足这个条件,一定不是文字版pdf + else: + return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断 + + +def classify_by_text_layout(text_layout_per_page: list): + """ + 判断文本布局是否以竖排为主。 + + Args: + text_layout_per_page (list): 文本布局列表,列表中的每个元素表示一页的文本布局, + 值为'vertical'表示竖排,值为'horizontal'表示横排。 + + Returns: + bool: 若文本布局以竖排为主,则返回False;否则返回True。 + 
""" + # 统计text_layout_per_page中竖排的个数 + count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical') + # 统计text_layout_per_page中横排的个数 + count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal') + # 计算text_layout_per_page中竖排的占比 + known_layout_cnt = count_vertical + count_horizontal + if known_layout_cnt != 0: + ratio = count_vertical / known_layout_cnt + if ratio >= 0.5: # 阈值设为0.5,适配3页里面有2页和两页里有一页的情况 + return False # 文本布局以竖排为主,认为不是文字版pdf + else: + return True # 文本布局以横排为主,认为是文字版pdf + else: + return False # 文本布局未知,默认认为不是文字版pdf + + +def classify_by_img_narrow_strips(page_width, page_height, img_sz_list): + """ + 判断一页是否由细长条组成,有两个条件: + 1. 图片的宽或高达到页面宽或高的90%,且长边需要是窄边长度的数倍以上 + 2. 整个页面所有的图片有80%以上满足条件1 + + Args: + page_width (float): 页面宽度 + page_height (float): 页面高度 + img_sz_list (list): 图片尺寸列表,每个元素为一个元组,表示图片的矩形区域和尺寸,形如(x0, y0, x1, y1, size),其中(x0, y0)为矩形区域的左上角坐标,(x1, y1)为矩形区域的右下角坐标,size为图片的尺寸 + + Returns: + bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False + """ + + def is_narrow_strip(img): + x0, y0, x1, y1, _ = img + width, height = x1 - x0, y1 - y0 + return any([ + # 图片宽度大于等于页面宽度的90%,且宽度大于等于高度4倍 + width >= page_width * 0.9 and width >= height * 4, + # 图片高度大于等于页面高度的90%,且高度大于等于宽度4倍 + height >= page_height * 0.9 and height >= width * 4, + ]) + + # 初始化满足条件的页面数量 + narrow_strip_pages_count = 0 + + # 遍历所有页面 + for page_img_list in img_sz_list: + # 忽略空页面 + if not page_img_list: + continue + + # 计算页面中的图片总数 + total_images = len(page_img_list) + + # 计算页面中细长条图片的数量 + narrow_strip_images_count = 0 + for img in page_img_list: + if is_narrow_strip(img): + narrow_strip_images_count += 1 + # 如果细长条图片的数量少于5,跳过 + if narrow_strip_images_count < 5: + continue + else: + # 如果细长条图片的比例大于或等于0.8,增加满足条件的页面数量 + if narrow_strip_images_count / total_images >= 0.8: + narrow_strip_pages_count += 1 + + # 计算满足条件的页面的比例 + narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list) + + return narrow_strip_pages_ratio < 0.5 + + +def classify(total_page: int, page_width, 
page_height, img_sz_list: list, text_len_list: list, img_num_list: list, + text_layout_list: list, invalid_chars: bool): + """ + 这里的图片和页面长度单位是pts + :param total_page: + :param text_len_list: + :param page_width: + :param page_height: + :param img_sz_list: + :param pdf_path: + :return: + """ + results = { + 'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list), + 'by_text_len': classify_by_text_len(text_len_list, total_page), + 'by_avg_words': classify_by_avg_words(text_len_list), + 'by_img_num': classify_by_img_num(img_sz_list, img_num_list), + 'by_text_layout': classify_by_text_layout(text_layout_list), + 'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list), + 'by_invalid_chars': invalid_chars, + } + + if all(results.values()): + return True, results + elif not any(results.values()): + return False, results + else: + logger.warning( + f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}," + f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}," + f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}," + f" by_invalid_chars: {results['by_invalid_chars']}", + file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 + return False, results + + +@click.command() +@click.option("--json-file", type=str, help="pdf信息") +def main(json_file): + if json_file is None: + print("json_file is None", file=sys.stderr) + exit(0) + try: + with open(json_file, "r") as f: + for l in f: + if l.strip() == "": + continue + o = json.loads(l) + total_page = o["total_page"] + page_width = o["page_width_pts"] + page_height = o["page_height_pts"] + img_sz_list = o["image_info_per_page"] + text_len_list = o['text_len_per_page'] + text_layout_list = o['text_layout_per_page'] + pdf_path = o['pdf_path'] + is_encrypted = o['is_encrypted'] + is_needs_password = 
o['is_needs_password'] + if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + continue + tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list) + o['is_text_pdf'] = tag + print(json.dumps(o, ensure_ascii=False)) + except Exception as e: + print("ERROR: ", e, file=sys.stderr) + + +if __name__ == "__main__": + main() + # false = False + # true = True + # null = None + # o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0
,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]],[[0
,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0
,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,281
6]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[
[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,
368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,5
13,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,20
85]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],
[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0
,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,
54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 
1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}} + # o = json.loads(json.dumps(o)) + # total_page = o["total_page"] + # page_width = o["page_width_pts"] + # page_height = o["page_height_pts"] + # img_sz_list = o["image_info_per_page"] + # text_len_list = o['text_len_per_page'] + # pdf_path = o['pdf_path'] + # is_encrypted = o['is_encrypted'] + # is_needs_password = o['is_needs_password'] + # if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + # print("加密的") + # exit(0) + # tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list) + # o['is_text_pdf'] = tag + # print(json.dumps(o, ensure_ascii=False)) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..89d44878d313a1a72f3bb311c812195e546d2a76 --- /dev/null +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -0,0 +1,388 @@ +""" +输入: s3路径,每行一个 +输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置 +""" +import sys +import click + +from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list +from magic_pdf.libs.commons import fitz +from loguru import logger +from collections import Counter + +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.language import detect_lang +from magic_pdf.libs.pdf_check import detect_invalid_chars + +scan_max_page = 50 +junk_limit_min = 10 + + +def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts): + max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in + result] + page_area = int(page_width_pts) * int(page_height_pts) + max_image_area_per_page = [area / page_area for area in max_image_area_per_page] + max_image_area_per_page = [area 
for area in max_image_area_per_page if area > 0.6] + return max_image_area_per_page + + +def process_image(page, junk_img_bojids=[]): + page_result = [] # 存每个页面里的多张图四元组信息 + items = page.get_images() + dedup = set() + for img in items: + # 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是 + img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等 + if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过 + continue + recs = page.get_image_rects(img, transform=True) + if recs: + rec = recs[0][0] + x0, y0, x1, y1 = map(int, rec) + width = x1 - x0 + height = y1 - y0 + if (x0, y0, x1, y1, img_bojid) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 + continue + if not all([width, height]): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义 + continue + dedup.add((x0, y0, x1, y1, img_bojid)) + page_result.append([x0, y0, x1, y1, img_bojid]) + return page_result + + +def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list: + """ + 返回每个页面里的图片的四元组,每个页面多个图片。 + :param doc: + :return: + """ + # 使用 Counter 计数 img_bojid 的出现次数 + img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images()) + # 找出出现次数超过 len(doc) 半数的 img_bojid + + junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免 + + junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit] + + #todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多 + #有两种扫描版,一种文字版,这里可能会有误判 + #扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张 + #扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断 + #文字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist + imgs_len_list = [len(page.get_images()) for page in doc] + + special_limit_pages = 10 + + # 统一用前十页结果做判断 + result = [] + break_loop = False + for i, page in enumerate(doc): + if break_loop: + break + if i >= special_limit_pages: + break + page_result = process_image(page) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析 + result.append(page_result) + for item in result: + if not any(item): # 
如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版 + if max(imgs_len_list) == min(imgs_len_list) and max( + imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break + junk_img_bojids = [] + else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist + pass + break_loop = True + break + if not break_loop: + # 获取前80%的元素 + top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8) + # 检查前80%的元素是否都相等 + if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min: + + # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist + # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min: + + #前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist + max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts) + if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空 + junk_img_bojids = [] + else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist + pass + else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片 + junk_img_bojids = [] + + #正式进入取前50页图片的信息流程 + result = [] + for i, page in enumerate(doc): + if i >= scan_max_page: + break + page_result = process_image(page, junk_img_bojids) + # logger.info(f"page {i} img_len: {len(page_result)}") + result.append(page_result) + + return result, junk_img_bojids + + +def get_pdf_page_size_pts(doc: fitz.Document): + page_cnt = len(doc) + l: int = min(page_cnt, 50) + #把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了) + page_width_list = [] + page_height_list = [] + for i in range(l): + page = doc[i] + page_rect = page.rect + page_width_list.append(page_rect.width) + page_height_list.append(page_rect.height) + + page_width_list.sort() + page_height_list.sort() + + median_width = page_width_list[len(page_width_list) // 2] + median_height = page_height_list[len(page_height_list) // 2] + + return median_width, median_height + + +def get_pdf_textlen_per_page(doc: fitz.Document): + text_len_lst = [] + for page in doc: + # 拿包含img和text的所有blocks + # text_block = 
page.get_text("blocks") + # 拿所有text的blocks + # text_block = page.get_text("words") + # text_block_len = sum([len(t[4]) for t in text_block]) + #拿所有text的str + text_block = page.get_text("text") + text_block_len = len(text_block) + # logger.info(f"page {page.number} text_block_len: {text_block_len}") + text_len_lst.append(text_block_len) + + return text_len_lst + + +def get_pdf_text_layout_per_page(doc: fitz.Document): + """ + 根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。 + + Args: + doc (fitz.Document): PDF文档对象。 + + Returns: + List[str]: 每一页的文本布局(横向、纵向、未知)。 + + """ + text_layout_list = [] + + for page_id, page in enumerate(doc): + if page_id >= scan_max_page: + break + # 创建每一页的纵向和横向的文本行数计数器 + vertical_count = 0 + horizontal_count = 0 + text_dict = page.get_text("dict") + if "blocks" in text_dict: + for block in text_dict["blocks"]: + if 'lines' in block: + for line in block["lines"]: + # 获取line的bbox顶点坐标 + x0, y0, x1, y1 = line['bbox'] + # 计算bbox的宽高 + width = x1 - x0 + height = y1 - y0 + # 计算bbox的面积 + area = width * height + font_sizes = [] + for span in line['spans']: + if 'size' in span: + font_sizes.append(span['size']) + if len(font_sizes) > 0: + average_font_size = sum(font_sizes) / len(font_sizes) + else: + average_font_size = 10 # 有的line拿不到font_size,先定一个阈值100 + if area <= average_font_size ** 2: # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向 + continue + else: + if 'wmode' in line: # 通过wmode判断文本方向 + if line['wmode'] == 1: # 判断是否为竖向文本 + vertical_count += 1 + elif line['wmode'] == 0: # 判断是否为横向文本 + horizontal_count += 1 + # if 'dir' in line: # 通过旋转角度计算判断文本方向 + # # 获取行的 "dir" 值 + # dir_value = line['dir'] + # cosine, sine = dir_value + # # 计算角度 + # angle = math.degrees(math.acos(cosine)) + # + # # 判断是否为横向文本 + # if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01: + # # line_text = ' '.join(span['text'] for span in line['spans']) + # # print('This line is horizontal:', line_text) + # horizontal_count += 1 + # # 判断是否为纵向文本 + # elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01: + 
# # line_text = ' '.join(span['text'] for span in line['spans']) + # # print('This line is vertical:', line_text) + # vertical_count += 1 + # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") + # 判断每一页的文本布局 + if vertical_count == 0 and horizontal_count == 0: # 该页没有文本,无法判断 + text_layout_list.append("unknow") + continue + else: + if vertical_count > horizontal_count: # 该页的文本纵向行数大于横向的 + text_layout_list.append("vertical") + else: # 该页的文本横向行数大于纵向的 + text_layout_list.append("horizontal") + # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") + return text_layout_list + + +'''定义一个自定义异常用来抛出单页svg太多的pdf''' + + +class PageSvgsTooManyError(Exception): + def __init__(self, message="Page SVGs are too many"): + self.message = message + super().__init__(self.message) + + +def get_svgs_per_page(doc: fitz.Document): + svgs_len_list = [] + for page_id, page in enumerate(doc): + # svgs = page.get_drawings() + svgs = page.get_cdrawings() # 切换成get_cdrawings,效率更高 + len_svgs = len(svgs) + if len_svgs >= 3000: + raise PageSvgsTooManyError() + else: + svgs_len_list.append(len_svgs) + # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}") + return svgs_len_list + + +def get_imgs_per_page(doc: fitz.Document): + imgs_len_list = [] + for page_id, page in enumerate(doc): + imgs = page.get_images() + imgs_len_list.append(len(imgs)) + # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}") + + return imgs_len_list + + +def get_language(doc: fitz.Document): + """ + 获取PDF文档的语言。 + Args: + doc (fitz.Document): PDF文档对象。 + Returns: + str: 文档语言,如 "en-US"。 + """ + language_lst = [] + for page_id, page in enumerate(doc): + if page_id >= scan_max_page: + break + # 拿所有text的str + text_block = page.get_text("text") + page_language = detect_lang(text_block) + language_lst.append(page_language) + + # logger.info(f"page_id: {page_id}, page_language: {page_language}") + + # 统计text_language_list中每种语言的个数 
+ count_dict = Counter(language_lst) + # 输出text_language_list中出现的次数最多的语言 + language = max(count_dict, key=count_dict.get) + return language + + +def check_invalid_chars(pdf_bytes): + """ + 乱码检测 + """ + return detect_invalid_chars(pdf_bytes) + + +def pdf_meta_scan(pdf_bytes: bytes): + """ + :param s3_pdf_path: + :param pdf_bytes: pdf文件的二进制数据 + 几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取 + """ + doc = fitz.open("pdf", pdf_bytes) + is_needs_password = doc.needs_pass + is_encrypted = doc.is_encrypted + total_page = len(doc) + if total_page == 0: + logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}") + result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF} + return result + else: + page_width_pts, page_height_pts = get_pdf_page_size_pts(doc) + # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}") + + # svgs_per_page = get_svgs_per_page(doc) + # logger.info(f"svgs_per_page: {svgs_per_page}") + imgs_per_page = get_imgs_per_page(doc) + # logger.info(f"imgs_per_page: {imgs_per_page}") + + image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts) + # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}") + text_len_per_page = get_pdf_textlen_per_page(doc) + # logger.info(f"text_len_per_page: {text_len_per_page}") + text_layout_per_page = get_pdf_text_layout_per_page(doc) + # logger.info(f"text_layout_per_page: {text_layout_per_page}") + text_language = get_language(doc) + # logger.info(f"text_language: {text_language}") + invalid_chars = check_invalid_chars(pdf_bytes) + # logger.info(f"invalid_chars: {invalid_chars}") + + # 最后输出一条json + res = { + "is_needs_password": is_needs_password, + "is_encrypted": is_encrypted, + "total_page": total_page, + "page_width_pts": int(page_width_pts), + "page_height_pts": int(page_height_pts), + "image_info_per_page": image_info_per_page, + "text_len_per_page": text_len_per_page, + "text_layout_per_page": 
text_layout_per_page, + "text_language": text_language, + # "svgs_per_page": svgs_per_page, + "imgs_per_page": imgs_per_page, # 增加每页img数量list + "junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list + "invalid_chars": invalid_chars, + "metadata": doc.metadata + } + # logger.info(json.dumps(res, ensure_ascii=False)) + return res + + +@click.command() +@click.option('--s3-pdf-path', help='s3上pdf文件的路径') +@click.option('--s3-profile', help='s3上的profile') +def main(s3_pdf_path: str, s3_profile: str): + """ + + """ + try: + file_content = read_file(s3_pdf_path, s3_profile) + pdf_meta_scan(file_content) + except Exception as e: + print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr) + logger.exception(e) + + +if __name__ == '__main__': + main() + # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf" + # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf" + # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf" + # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf" + # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") + # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","") + # doc = fitz.open("pdf", file_content) + # text_layout_lst = get_pdf_text_layout_per_page(doc) + # print(text_layout_lst) diff --git a/magic_pdf/layout/__init__.py b/magic_pdf/layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/layout/bbox_sort.py b/magic_pdf/layout/bbox_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1508ff5c3dc41a44035af494a486ad605c1cad --- /dev/null +++ b/magic_pdf/layout/bbox_sort.py @@ -0,0 +1,681 @@ +# 
定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None +# 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。 + + + +from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap +from magic_pdf.libs.commons import mymax + +X0_IDX = 0 +Y0_IDX = 1 +X1_IDX = 2 +Y1_IDX = 3 +CONTENT_IDX = 4 +IDX_X = 5 +IDX_Y = 6 +CONTENT_TYPE_IDX = 7 + +X0_EXT_IDX = 8 +Y0_EXT_IDX = 9 +X1_EXT_IDX = 10 +Y1_EXT_IDX = 11 + + +def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page): + """ + text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json + 把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构 + """ + all_bboxes = [] + + for image in image_info: + box = image['bbox'] + # 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1 + # 把长宽都小于50的去掉 + if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50: + continue + all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None]) + + for table in table_info: + box = table['bbox'] + all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None]) + + """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes""" + # 加入文本block + text_block_temp = [] + for block in text_raw_blocks: + bbox = block['bbox'] + text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) + + text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp) + text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环 + + 
+ """找出会影响layout的色块、横向分割线""" + spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], ) + # 还要去掉存在于spilter_bboxes里的text_block + if len(spilter_bboxes) > 0: + text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])] + + for bbox in text_block_new: + all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) + + for bbox in spilter_bboxes: + all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None]) + + + return all_bboxes + +def resolve_bbox_overlap_for_layout_det(bboxes:list): + """ + 1. 去掉bbox互相包含的,去掉被包含的 + 2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box + """ + def _is_in_other_bbox(i:int): + """ + 判断i个box是否被其他box有所包含 + """ + for j in range(0, len(bboxes)): + if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]): + return True + # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]): + # return True + + return False + + # 首先去掉被包含的bbox + new_bbox_1 = [] + for i in range(0, len(bboxes)): + if not _is_in_other_bbox(i): + new_bbox_1.append(bboxes[i]) + + # 其次扩展大的box + new_box = [] + new_bbox_2 = [] + len_1 = len(new_bbox_2) + while True: + merged_idx = [] + for i in range(0, len(new_bbox_1)): + if i in merged_idx: + continue + for j in range(i+1, len(new_bbox_1)): + if j in merged_idx: + continue + bx1 = new_bbox_1[i] + bx2 = new_bbox_1[j] + if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]): + merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]]) + new_bbox_2.append(merged_box) + merged_idx.append(i) + merged_idx.append(j) + + for i in range(0, len(new_bbox_1)): # 没有合并的加入进来 + if i not in merged_idx: + new_bbox_2.append(new_bbox_1[i]) + + if len(new_bbox_2)==0 or len_1==len(new_bbox_2): + break + else: + len_1 = len(new_bbox_2) + new_box = 
new_bbox_2 + new_bbox_1, new_bbox_2 = new_bbox_2, [] + + return new_box + + +def filter_lines_bbox(bboxes: list): + """ + 过滤掉bbox为空的行 + """ + new_box = [] + for box in bboxes: + x0, y0, x1, y1 = box[0], box[1], box[2], box[3] + if abs(x0-x1)<=1 or abs(y0-y1)<=1: + continue + else: + new_box.append(box) + return new_box + + +################################################################################ +# 第一种排序算法 +# 以下是基于延长线遮挡做的一个算法 +# +################################################################################ +def find_all_left_bbox(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox左边的所有bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]] + return left_boxes + + +def find_all_top_bbox(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox上面的所有bbox + """ + top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]] + return top_boxes + + +def get_and_set_idx_x(this_bbox, all_bboxes) -> int: + """ + 寻找this_bbox在all_bboxes中的遮挡深度 idx_x + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes) + if len(all_left_bboxes) == 0: + this_bbox[IDX_X] = 0 + else: + all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes] + max_idx_x = mymax(all_left_bboxes_idx) + this_bbox[IDX_X] = max_idx_x + 1 + return this_bbox[IDX_X] + + +def get_and_set_idx_y(this_bbox, all_bboxes) -> int: + """ + 寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes) + if len(all_top_bboxes) == 0: + this_bbox[IDX_Y] = 0 + else: + all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes] + max_idx_y = mymax(all_top_bboxes_idx) + this_bbox[IDX_Y] = max_idx_y + 1 + return this_bbox[IDX_Y] + + +def bbox_sort(all_bboxes: list): + """ + 排序 + """ + all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) 
for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + + +################################################################################ +# 第二种排序算法 +# 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况 +# +################################################################################ + +def find_left_nearest_bbox(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True) + left_boxes = [left_boxes[0]] + else: + left_boxes = [] + return left_boxes + + +def get_and_set_idx_x_2(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes) + if len(left_nearest_bbox) == 0: + this_bbox[IDX_X] = 0 + else: + left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes) + this_bbox[IDX_X] = left_idx_x + 1 + return this_bbox[IDX_X] + + +def find_top_nearest_bbox(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox + """ + top_boxes = 
[box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(top_boxes) > 0: + top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_boxes = [top_boxes[0]] + else: + top_boxes = [] + return top_boxes + + +def get_and_set_idx_y_2(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes) + if len(top_nearest_bbox) == 0: + this_bbox[IDX_Y] = 0 + else: + top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes) + this_bbox[IDX_Y] = top_idx_y + 1 + return this_bbox[IDX_Y] + + +def paper_bbox_sort(all_bboxes: list, page_width, page_height): + all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + +################################################################################ +""" +第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1 +这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下: +1. 
首先在水平方向上对bbox进行扩展。扩展方法是: + - 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。 + - 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。 + - 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1] + +2. 合并所有的连续水平方向的bbox, 合并方法是: + - 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。 + +3. 然后在垂直方向上对bbox进行扩展。扩展方法是: + - 首先从page上切割掉合并后的水平bbox, 得到几个新的block + 针对每个block + - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0 + - x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1 + 随后在垂直方向上合并所有的连续的block,方法如下: + - 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。 + 如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’ + 如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。 + 至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。 + (完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去) +""" +################################################################################ +def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox + 这里使用扩展之后的bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([ + box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX], + this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX], + box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True) + left_boxes = left_boxes + else: + left_boxes = [] + return left_boxes + +def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox + 这里使用扩展之后的bbox + """ + 
top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([ + box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX], + this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX], + box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])] + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(top_boxes) > 0: + top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True) + top_boxes = top_boxes + else: + top_boxes = [] + return top_boxes + +def get_and_set_idx_x_2_ext(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes) + if len(left_nearest_bbox) == 0: + this_bbox[IDX_X] = 0 + else: + left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox] + this_bbox[IDX_X] = mymax(left_idx_x) + 1 + return this_bbox[IDX_X] + +def get_and_set_idx_y_2_ext(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes) + if len(top_nearest_bbox) == 0: + this_bbox[IDX_Y] = 0 + else: + top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox] + this_bbox[IDX_Y] = mymax(top_idx_y) + 1 + return this_bbox[IDX_Y] + +def _paper_bbox_sort_ext(all_bboxes: list): + all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in 
all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + +# =============================================================================================== +def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox左边的所有bbox, 使用延长线 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]] + if len(left_boxes): + left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + + return left_boxes + +def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox右边的所有bbox, 使用延长线 + """ + right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]] + if len(right_boxes): + right_boxes.sort(key=lambda x: x[X0_IDX]) + right_boxes = right_boxes[0] + else: + right_boxes = None + return right_boxes + +# ============================================================================================= + +def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox, 不用延长线并且不能像 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + return left_boxes + +def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + right_bboxes = [box for box in 
all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([ + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + if len(right_bboxes)>0: + right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX]) + right_bboxes = right_bboxes[0] + else: + right_bboxes = None + return right_bboxes + +def reset_idx_x_y(all_boxes:list)->list: + for box in all_boxes: + box[IDX_X] = None + box[IDX_Y] = None + + return all_boxes + +# =================================================================================================== +def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list: + """ + 找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 然后再过滤一下,找到上方距离this_bbox最近的那个 + if len(top_bboxes) > 0: + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list: + """ + 找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 
然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(bottom_bboxes) > 0: + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_boundry_bboxes(bboxes:list) -> tuple: + """ + 找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1) + """ + x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX] + for box in bboxes: + x0 = min(box[X0_IDX], x0) + y0 = min(box[Y0_IDX], y0) + x1 = max(box[X1_IDX], x1) + y1 = max(box[Y1_IDX], y1) + + return x0, y0, x1, y1 + + +def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list: + """ + 在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox + """ + for box in bboxes: + top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes) + bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes) + if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列 + box[X0_EXT_IDX] = box[X0_IDX] + box[Y0_EXT_IDX] = boundry_y0 + box[X1_EXT_IDX] = box[X1_IDX] + box[Y1_EXT_IDX] = boundry_y1 + # else: + # if top_nearest_bbox is None: + # box[Y0_EXT_IDX] = boundry_y0 + # else: + # box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1 + # if bottom_nearest_bbox is None: + # box[Y1_EXT_IDX] = boundry_y1 + # else: + # box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1 + # box[X0_EXT_IDX] = box[X0_IDX] + # box[X1_EXT_IDX] = box[X1_IDX] + return bboxes + + +# =================================================================================================== + +def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int): + """ + 增加预处理行为的排序: + return: + [ + { + "layout_bbox": [x0, y0, x1, y1], + "layout_label":"GOOD_LAYOUT/BAD_LAYOUT", + "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序 + } + ] + """ + sorted_layouts = [] # 最后的返回结果 + page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1 + + all_bboxes = 
paper_bbox_sort(all_bboxes) # 大致拍下序 + # 首先在水平方向上扩展独占一行的bbox + for bbox in all_bboxes: + left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线 + right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes) + if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行 + bbox[X0_EXT_IDX] = page_x0 + bbox[Y0_EXT_IDX] = bbox[Y0_IDX] + bbox[X1_EXT_IDX] = page_x1 + bbox[Y1_EXT_IDX] = bbox[Y1_IDX] + + # 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group + if len(all_bboxes)==1: + return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}] + if len(all_bboxes)==0: + return [] + + """ + 然后合并所有连续水平方向的bbox. + + """ + all_bboxes.sort(key=lambda x: x[Y0_IDX]) + h_bboxes = [] + h_bbox_group = [] + v_boxes = [] + + for bbox in all_bboxes: + if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1: + h_bbox_group.append(bbox) + else: + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + h_bbox_group = [] + # 最后一个group + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + + """ + 现在h_bboxes里面是所有的group了,每个group都是一个list + 对h_bboxes里的每个group进行计算放回到sorted_layouts里 + """ + for gp in h_bboxes: + gp.sort(key=lambda x: x[Y0_IDX]) + block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp} + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + block_info["layout_bbox"] = [x0, y0, x1, y1] + sorted_layouts.append(block_info) + + # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分 + h_split_lines = [page_y0] + for gp in h_bboxes: + layout_bbox = gp['layout_bbox'] + y0, y1 = layout_bbox[1], layout_bbox[3] + h_split_lines.append(y0) + h_split_lines.append(y1) + h_split_lines.append(page_y1) + + unsplited_bboxes = [] + for i in range(0, len(h_split_lines), 2): + start_y0, start_y1 = h_split_lines[i:i+2] + # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块 + bboxes_in_block = [bbox for bbox in 
all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1] + unsplited_bboxes.append(bboxes_in_block) + # ================== 至此,水平方向的 已经切分排序完毕==================================== + """ + 接下来针对每个非水平的部分切分垂直方向的 + 此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分. + 分3步: + 1. 先把能完全垂直打通的隔离出去当做一个layout + 2. 其余的先垂直切分 + 3. 垂直切分之后的部分再尝试水平切分 + 4. 剩下的不能被切分的各个部分当成一个layout + """ + # 对每部分进行垂直切分 + for bboxes_in_block in unsplited_bboxes: + # 首先对这个block的bbox进行垂直方向上的扩展 + boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block) + # 进行垂直方向上的扩展 + extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1) + # 然后对这个block进行垂直方向上的切分 + extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大,代表了从左到右读取 + v_boxes_group = [] + for bbox in extended_vertical_bboxes: + if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1: + v_boxes_group.append(bbox) + else: + if len(v_boxes_group)>0: + v_boxes.append(v_boxes_group) + v_boxes_group = [] + + if len(v_boxes_group)>0: + + v_boxes.append(v_boxes_group) + + # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了,因为上面已经做了 + for gp in v_boxes: + gp.sort(key=lambda x: x[X0_IDX]) + block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp} + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + block_info["layout_bbox"] = [x0, y0, x1, y1] + sorted_layouts.append(block_info) + + # 在垂直方向上,划分子块,也就是用贯通的垂直线进行切分。这些被切分出来的块,极大可能是可被垂直切分的,如果不能完全的垂直切分,那么尝试水平切分。都不能的则当成一个layout + v_split_lines = [boundry_x0] + for gp in v_boxes: + layout_bbox = gp['layout_bbox'] + x0, x1 = layout_bbox[0], layout_bbox[2] + v_split_lines.append(x0) + v_split_lines.append(x1) + v_split_lines.append(boundry_x1) + + reset_idx_x_y(all_bboxes) + all_boxes = _paper_bbox_sort_ext(all_bboxes) + return all_boxes + + + + + + + + diff --git a/magic_pdf/layout/layout_det_utils.py 
b/magic_pdf/layout/layout_det_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8b2b36cc071c9726bdb6ca03a3e5b98ded4eeb24 --- /dev/null +++ b/magic_pdf/layout/layout_det_utils.py @@ -0,0 +1,182 @@ +from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX +from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect + + +def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧垂直方向上和this_bbox有重叠的bbox, 不用延长线 + 并且要考虑两个box左右相交的情况,如果相交了,那么右侧的box就不算最左侧。 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] + and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + return left_boxes + +def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] + and any([ + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])] + + if len(right_bboxes)>0: + right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX]) + right_bboxes = right_bboxes[0] + else: + right_bboxes = None + return right_bboxes + +def 
find_all_top_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +# =================================================================================================================== +def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + # y0最小, 
X1最大的那个,也就是box上边缘最靠近this_bbox的那个,并且还最靠右 + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]] + # 然后再y1相同的情况下,找到x1最大的那个 + bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + # y0最小, X0最小的那个 + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]] + # 然后再y0相同的情况下,找到x0最小的那个 + bottom_bboxes.sort(key=lambda x: x[X0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + # y1最大, X0最小的那个 + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]] + # 然后再y1相同的情况下,找到x0最小的那个 + top_bboxes.sort(key=lambda x: x[X0_IDX]) + top_bboxes = top_bboxes[0] + else: + top_bboxes 
= None + return top_bboxes + +def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + # y1最大, X1最大的那个 + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]] + # 然后再y1相同的情况下,找到x1最大的那个 + top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +# =================================================================================================================== + +def get_left_edge_bboxes(all_bboxes) -> list: + """ + 返回最左边的bbox + """ + left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None] + return left_bboxes + +def get_right_edge_bboxes(all_bboxes) -> list: + """ + 返回最右边的bbox + """ + right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None] + return right_bboxes + +def fix_vertical_bbox_pos(bboxes:list): + """ + 检查这批bbox在垂直方向是否有轻微的重叠,如果重叠了,就把重叠的bbox往下移动一点 + 在x方向上必须一个包含或者被包含,或者完全重叠,不能只有部分重叠 + """ + bboxes.sort(key=lambda x: x[Y0_IDX]) # 从上向下排列 + for i in range(0, len(bboxes)): + for j in range(i+1, len(bboxes)): + if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]): + # 如果两个bbox有部分重叠,那么就把下面的bbox往下移动一点 + bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2 # 2是个经验值 + break + return bboxes diff --git a/magic_pdf/layout/layout_sort.py b/magic_pdf/layout/layout_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..6b387d4c97e318aaa831b92a5238a5f9d67e7d11 --- 
/dev/null +++ b/magic_pdf/layout/layout_sort.py @@ -0,0 +1,732 @@ +""" +对pdf上的box进行layout识别,并对内部组成的box进行排序 +""" + +from loguru import logger +from magic_pdf.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort +from magic_pdf.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes +from magic_pdf.libs.boxbase import get_bbox_in_boundry + + +LAYOUT_V = "V" +LAYOUT_H = "H" +LAYOUT_UNPROC = "U" +LAYOUT_BAD = "B" + +def _is_single_line_text(bbox): + """ + 检查bbox里面的文字是否只有一行 + """ + return True # TODO + box_type = bbox[CONTENT_TYPE_IDX] + if box_type != 'text': + return False + paras = bbox[CONTENT_IDX]["paras"] + text_content = "" + for para_id, para in paras.items(): # 拼装内部的段落文本 + is_title = para['is_title'] + if is_title!=0: + text_content += f"## {para['text']}" + else: + text_content += para["text"] + text_content += "\n\n" + + return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split("\n\n")) <= 1 + + +def _horizontal_split(bboxes:list, boundry:tuple, avg_font_size=20)-> list: + """ + 对bboxes进行水平切割 + 方法是:找到左侧和右侧都没有被直接遮挡的box,然后进行扩展,之后进行切割 + return: + 返回几个大的Layout区域 [[x0, y0, x1, y1, "h|u|v"], ], h代表水平,u代表未探测的,v代表垂直布局 + """ + sorted_layout_blocks = [] # 这是要最终返回的值 + + bound_x0, bound_y0, bound_x1, bound_y1 = boundry + all_bboxes = get_bbox_in_boundry(bboxes, boundry) + #all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。 + """ + 首先在水平方向上扩展独占一行的bbox + + """ + last_h_split_line_y1 = bound_y0 #记录下上次的水平分割线 + for i, bbox in enumerate(all_bboxes): + left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes) # 非扩展线 + 
right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes) + if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行 + """ + 然而,如果只是孤立的一行文字,那么就还要满足以下几个条件才可以: + 1. bbox和中心线相交。或者 + 2. 上方或者下方也存在同类水平的独占一行的bbox。 或者 + 3. TODO 加强条件:这个bbox上方和下方是同一列column,那么就不能算作独占一行 + """ + # 先检查这个bbox里是否只包含一行文字 + is_single_line = _is_single_line_text(bbox) + """ + 这里有个点需要注意,当页面内容不是居中的时候,第一次调用传递的是page的boundry,这个时候mid_x就不是中心线了. + 所以这里计算出最紧致的boundry,然后再计算mid_x + """ + boundry_real_x0, boundry_real_x1 = min([bbox[X0_IDX] for bbox in all_bboxes]), max([bbox[X1_IDX] for bbox in all_bboxes]) + mid_x = (boundry_real_x0+boundry_real_x1)/2 + # 检查这个box是否内容在中心线有交 + # 必须跨过去2个字符的宽度 + is_cross_boundry_mid_line = min(mid_x-bbox[X0_IDX], bbox[X1_IDX]-mid_x) > avg_font_size*2 + """ + 检查条件2 + """ + is_belong_to_col = False + """ + 检查是否能被上方col吸收,方法是: + 1. 上方非空且不是独占一行的,并且 + 2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1] + """ + """ + 以迭代的方式向上找,查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]] + """ + #先确定上方的y0, y0 + b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX] + #然后从box开始逐个向上找到所有与box在x上有交集的box + box_to_check = [bound_x0, b_y0, bound_x1, b_y1] + bbox_in_bound_check = get_bbox_in_boundry(all_bboxes, box_to_check) + + bboxes_on_top = [] + virtual_box = bbox + while True: + b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check) + if b_on_top is not None: + bboxes_on_top.append(b_on_top) + virtual_box = [min([virtual_box[X0_IDX], b_on_top[X0_IDX]]), min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]), max([virtual_box[X1_IDX], b_on_top[X1_IDX]]), b_y1] + else: + break + + # 随后确定这些box的最小x0, 最大x1 + if len(bboxes_on_top)>0 and len(bboxes_on_top) != len(bbox_in_bound_check):# virtual_box可能会膨胀到占满整个区域,这实际上就不能属于一个col了。 + min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX] + # 然后采用一种比较粗糙的方法,看min_x0,max_x1是否与位于[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]之间的box有相交 + + if not any([b[X0_IDX] <= min_x0-1 <= b[X1_IDX] or b[X0_IDX] <= max_x1+1 <= b[X1_IDX] for b in 
bbox_in_bound_check]): + # 其上,下都不能被扩展成行,暂时只检查一下上方 TODO + top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes) + bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes) + if not any([ + top_nearest_bbox is not None and (find_all_left_bbox_direct(top_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(top_nearest_bbox, bboxes) is None), + bottom_nearest_bbox is not None and (find_all_left_bbox_direct(bottom_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(bottom_nearest_bbox, bboxes) is None), + top_nearest_bbox is None or bottom_nearest_bbox is None + ]): + is_belong_to_col = True + + # 检查是否能被下方col吸收 TODO + + """ + 这里为什么没有is_cross_boundry_mid_line的条件呢? + 确实有些杂志左右两栏宽度不是对称的。 + """ + if not is_belong_to_col or is_cross_boundry_mid_line: + bbox[X0_EXT_IDX] = bound_x0 + bbox[Y0_EXT_IDX] = bbox[Y0_IDX] + bbox[X1_EXT_IDX] = bound_x1 + bbox[Y1_EXT_IDX] = bbox[Y1_IDX] + last_h_split_line_y1 = bbox[Y1_IDX] # 更新这条线 + else: + continue + """ + 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group + 然后合并所有连续水平方向的bbox. 
+ """ + all_bboxes.sort(key=lambda x: x[Y0_IDX]) + h_bboxes = [] + h_bbox_group = [] + + for bbox in all_bboxes: + if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1: + h_bbox_group.append(bbox) + else: + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + h_bbox_group = [] + # 最后一个group + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + + """ + 现在h_bboxes里面是所有的group了,每个group都是一个list + 对h_bboxes里的每个group进行计算放回到sorted_layouts里 + """ + h_layouts = [] + for gp in h_bboxes: + gp.sort(key=lambda x: x[Y0_IDX]) + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + h_layouts.append([x0, y0, x1, y1, LAYOUT_H]) # 水平的布局 + + """ + 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分 + """ + h_split_lines = [bound_y0] + for gp in h_bboxes: # gp是一个list[bbox_list] + y0, y1 = gp[0][1], gp[-1][3] + h_split_lines.append(y0) + h_split_lines.append(y1) + h_split_lines.append(bound_y1) + + unsplited_bboxes = [] + for i in range(0, len(h_split_lines), 2): + start_y0, start_y1 = h_split_lines[i:i+2] + # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块 + bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1] + unsplited_bboxes.append(bboxes_in_block) + # 接着把未处理的加入到h_layouts里 + for bboxes_in_block in unsplited_bboxes: + if len(bboxes_in_block) == 0: + continue + x0, y0, x1, y1 = bound_x0, min([bbox[Y0_IDX] for bbox in bboxes_in_block]), bound_x1, max([bbox[Y1_IDX] for bbox in bboxes_in_block]) + h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC]) + + h_layouts.sort(key=lambda x: x[1]) # 按照y0排序, 也就是从上到下的顺序 + + """ + 转换成如下格式返回 + """ + for layout in h_layouts: + sorted_layout_blocks.append({ + "layout_bbox": layout[:4], + "layout_label":layout[4], + "sub_layout":[], + }) + return sorted_layout_blocks + +############################################################################################### +# +# 垂直方向的处理 +# +# 
def _vertical_align_split_v1(bboxes:list, boundry:tuple)-> list:
    """
    Compute vertical alignment and split *bboxes* into column layouts.
    Handles splitting a "one column, many rows" region along the column axis.
    If the region cannot be split completely, the remainder is returned as a
    layout whose layout_label is "u" (LAYOUT_UNPROC).
    -----------------------
    |           |    |
    |           |    |
    |           |    |
    |           |    |
    -------------------------
    The layout above will be split into 2 columns.
    """
    sorted_layout_blocks = [] # the final return value
    new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]

    v_blocks = []
    """
    First peel columns off from left to right.
    """
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
        if len(left_edge_bboxes) == 0:
            break
        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes])+1
        # Check whether this split line crosses or touches any bbox's horizontal span.
        if any([bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # The vertical split line cuts through some boxes, so a complete
            # vertical split is impossible here.
            break
        else: # Successfully carved out one column.
            # Use the leftmost bbox on the left edge as the layout's x0
            layout_x0 = min([bbox[X0_IDX] for bbox in left_edge_bboxes]) # mainly so drawn layouts keep some visual spacing
            v_blocks.append([layout_x0, new_boundry[1], right_split_line_x1, new_boundry[3], LAYOUT_V])
            new_boundry[0] = right_split_line_x1 # shrink the boundary from the left

    """
    Then cut from right to left.  If the remainder still cannot be split
    completely, return it as a layout whose layout_label is "u".
    """
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
        if len(right_edge_bboxes) == 0:
            break
        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes])-1
        # Check whether this split line crosses or touches any bbox's horizontal span.
        if any([bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # This is the unsplittable remainder.
            unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
            break
        else:
            # Use the rightmost bbox on the right edge as the layout's x1
            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
            v_blocks.append([left_split_line_x0, new_boundry[1], layout_x1, new_boundry[3], LAYOUT_V])
            new_boundry[2] = left_split_line_x0 # shrink the boundary from the right

    """
    Finally assemble everything into the layout dict format and return.
    """
    for block in v_blocks:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })
    for block in unsplited_block:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })

    # Sort by x0, i.e. left-to-right reading order.
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _vertical_align_split_v2(bboxes:list, boundry:tuple)-> list:
    """
    Improved version of _vertical_align_split_v1.  The old algorithm could
    treat second-column boxes as part of the first column whenever their left
    side was unobstructed, collapsing a multi-column layout into one column.
    This version starts from the top-left box and scans downwards, growing the
    window [w_x0, w_x1] until it can no longer be extended or the bottom
    boundary is reached.
    """
    sorted_layout_blocks = [] # the final return value
    new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]
    bad_boxes = [] # boxes that a split line cut through
    v_blocks = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        if len(all_bboxes) == 0:
            break
        left_top_box = min(all_bboxes, key=lambda x: (x[X0_IDX],x[Y0_IDX]))# should be hardened: verify it really sits in the first column. TODO
        start_box = [left_top_box[X0_IDX], left_top_box[Y0_IDX], left_top_box[X1_IDX], left_top_box[Y1_IDX]]
        w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
        """
        Walk down from this box to the nearest box below, widening [w_x0, w_x1]
        along the way.  After widening, the line x = w_x1 is used to test
        whether any box inside the boundary is crossed; if so, no further
        extension is possible.  Once extension stops, check whether the bottom
        boundary was reached:
        1. reached: update the left boundary and split off the next column;
        2. not reached: fall through to the right-to-left loop below.
        """
        while left_top_box is not None: # scan downwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_bottom_bbox_direct_from_left_edge(virtual_box, all_bboxes)
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])
        # In case the initial box sat in the middle of the column, also scan upwards.
        start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]] # widen first for robustness
        left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
        while left_top_box is not None: # scan upwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])

        # Check whether the split line x = w_x1 + 1 cuts through any box.
        if any([bbox[X0_IDX] <= w_x1+1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            for b in all_bboxes:
                if b[X0_IDX] <= w_x1+1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else: # Successfully carved out one column.
            v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
            new_boundry[0] = w_x1 # shrink the boundary from the left

    """
    Now scan starting from the top-right box.
    """
    w_x0 , w_x1 = 0, 0
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        if len(all_bboxes) == 0:
            break
        # First find the box(es) with the largest X1.
        bbox_list_sorted = sorted(all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True)
        # Then, find the boxes with the smallest Y0 value
        bigest_x1 = bbox_list_sorted[0][X1_IDX]
        boxes_with_bigest_x1 = [bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1] # i.e. the rightmost ones
        right_top_box = min(boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]) # the one with the smallest y0
        start_box = [right_top_box[X0_IDX], right_top_box[Y0_IDX], right_top_box[X1_IDX], right_top_box[Y1_IDX]]
        w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]

        while right_top_box is not None: # scan downwards along the right edge
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_bottom_bbox_direct_from_right_edge(virtual_box, all_bboxes)
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])
        # Then scan upwards.
        start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]] # widen first for robustness
        right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
        while right_top_box is not None:
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_top_bbox_direct_from_right_edge(virtual_box, all_bboxes)
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])

        # If the split line x = w_x0 - 1 cuts through any box, a complete
        # vertical split is impossible.
        if any([bbox[X0_IDX] <= w_x0-1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
            for b in all_bboxes:
                if b[X0_IDX] <= w_x0-1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else: # Successfully carved out one column.
            v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
            new_boundry[2] = w_x0

    """Convert into the layout dict structure."""
    for block in v_blocks:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })

    for block in unsplited_block:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
            "bad_boxes": bad_boxes # record the boxes that a split line cut through
        })


    # Sort by x0 (left-to-right reading order).
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _try_horizontal_mult_column_split(bboxes:list, boundry:tuple)-> list:
    """
    Try a horizontal multi-column split; if nothing can be split, the region
    should be returned as a BAD_LAYOUT.
    ------------------
    |        |       |
    ------------------
    |    |   |   |   |   <- the scenario this function is meant to split
    ------------------
    |        |       |
    |        |       |
    """
    # NOTE(review): unimplemented stub — currently always returns None.
    pass
这个版本里,如果垂直切分不动,那就当一个BAD_LAYOUT返回 + + -------------------------- + | | | + | | | + | | + 这种列是此函数要切分的 -> | | + | | + | | | + | | | + ------------------------- + """ + sorted_layout_blocks = [] # 这是要最终返回的值 + + bound_x0, bound_y0, bound_x1, bound_y1 = boundry + all_bboxes = get_bbox_in_boundry(bboxes, boundry) + """ + all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖 + all_bboxes = fix_hor_bbox_pos(all_bboxes) # 水平解覆盖 + + 这两行代码目前先不执行,因为公式检测,表格检测还不是很成熟,导致非常多的textblock参与了运算,时间消耗太大。 + 这两行代码的作用是: + 如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩,从而避免重叠。对布局切分来说带来正反馈。 + """ + + #all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。 + """ + 首先在垂直方向上扩展独占一行的bbox + + """ + for bbox in all_bboxes: + top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes) # 非扩展线 + bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes) + if top_nearest_bbox is None and bottom_nearest_bbox is None and not any([b[X0_IDX]=start_x0 and bbox[X1_IDX]<=start_x1] + unsplited_bboxes.append(bboxes_in_block) + # 接着把未处理的加入到v_layouts里 + for bboxes_in_block in unsplited_bboxes: + if len(bboxes_in_block) == 0: + continue + x0, y0, x1, y1 = min([bbox[X0_IDX] for bbox in bboxes_in_block]), bound_y0, max([bbox[X1_IDX] for bbox in bboxes_in_block]), bound_y1 + v_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC]) # 说明这篇区域未能够分析出可靠的版面 + + v_layouts.sort(key=lambda x: x[0]) # 按照x0排序, 也就是从左到右的顺序 + + for layout in v_layouts: + sorted_layout_blocks.append({ + "layout_bbox": layout[:4], + "layout_label":layout[4], + "sub_layout":[], + }) + + """ + 至此,垂直方向切成了2种类型,其一是独占一列的,其二是未处理的。 + 下面对这些未处理的进行垂直方向切分,这个切分要切出来类似“吕”这种类型的垂直方向的布局 + """ + for i, layout in enumerate(sorted_layout_blocks): + if layout['layout_label'] == LAYOUT_UNPROC: + x0, y0, x1, y1 = layout['layout_bbox'] + v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1]) + sorted_layout_blocks[i] = { + "layout_bbox": [x0, y0, x1, y1], + "layout_label": LAYOUT_H, + "sub_layout": v_split_layouts + } 
def split_layout(bboxes:list, boundry:tuple, page_num:int)-> list:
    """
    Split *bboxes* into layouts.
    return:
        [
            {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label":"u|v|h|b", unprocessed|vertical|horizontal|BAD_LAYOUT
                "sub_layout": [] # each element is [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], already in reading order
            }
        ]
    example:
        [
            {
                "layout_bbox": [0, 0, 100, 100],
                "layout_label":"u|v|h|b",
                "sub_layout":[

                ]
            },
            {
                "layout_bbox": [0, 0, 100, 100],
                "layout_label":"u|v|h|b",
                "sub_layout":[
                    {
                        "layout_bbox": [0, 0, 100, 100],
                        "layout_label":"u|v|h|b",
                        "content_bboxes":[
                            [],
                            [],
                            []
                        ]
                    },
                    {
                        "layout_bbox": [0, 0, 100, 100],
                        "layout_label":"u|v|h|b",
                        "sub_layout":[

                        ]
                    }
            }
        ]
    """
    sorted_layouts = [] # the final return value

    boundry_x0, boundry_y0, boundry_x1, boundry_y1 = boundry
    if len(bboxes) <=1:
        return [
            {
                "layout_bbox": [boundry_x0, boundry_y0, boundry_x1, boundry_y1],
                "layout_label": LAYOUT_V,
                "sub_layout":[]
            }
        ]

    """
    Split horizontally first, then vertically.
    """
    bboxes = paper_bbox_sort(bboxes, boundry_x1-boundry_x0, boundry_y1-boundry_y0)
    sorted_layouts = _horizontal_split(bboxes, boundry) # layouts produced by the horizontal split
    for i, layout in enumerate(sorted_layouts):
        x0, y0, x1, y1 = layout['layout_bbox']
        layout_type = layout['layout_label']
        if layout_type == LAYOUT_UNPROC: # not an exclusive single-row region, needs a vertical split
            v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])

            """
            Edge case: if this call separated only ONE column layout, the split
            exceeded the algorithm's ability.  We assume the incoming boxes
            already had full rows stripped, so a proper split must yield several
            columns; a single layout made of multiple boxes means the region
            cannot be split, so mark it LAYOUT_UNPROC.
            """
            layout_label = LAYOUT_V
            if len(v_split_layouts) == 1:
                if len(v_split_layouts[0]['sub_layout']) == 0:
                    layout_label = LAYOUT_UNPROC
                    #logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)

            """
            Assemble the final layout.
            """
            sorted_layouts[i] = {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label": layout_label,
                "sub_layout": v_split_layouts
            }
            layout['layout_label'] = LAYOUT_H # NOTE(review): dead store — `layout` is the dict that was just replaced in sorted_layouts[i] on the line above; nothing reads it afterwards

    """
    Horizontal and vertical splitting are both done.  Some regions may remain
    unprocessed because neither direction could split them; those should get a
    final joint multi-block horizontal split via _try_horizontal_mult_block_split
    and otherwise be returned as BAD_LAYOUT.
    """
    # TODO

    return sorted_layouts
def get_bboxes_layout(all_boxes:list, boundry:tuple, page_id:int):
    """
    Split a page's boxes into a layout tree, then flatten its leaves into
    reading order.

    Args:
        all_boxes: the page's boxes (augmented lists, see split_layout).
        boundry:   (x0, y0, x1, y1) boundary of the page/region.
        page_id:   page number, forwarded to split_layout for diagnostics.

    Returns:
        tuple: (leaf layout blocks in reading order, the full layout tree).
        Each block is {"layout_bbox": [x0, y0, x1, y1],
                       "layout_label": "u|v|h|b",  # unprocessed|vertical|horizontal|BAD_LAYOUT
                       "sub_layout": [...]}.
    """
    def _preorder_traversal(layouts):
        """
        Collect the leaf nodes (len(sub_layout) == 0) of the layout tree in
        pre-order, i.e. top-to-bottom, left-to-right reading order.
        """
        leaves = []
        # Fix: the original wrote `for layout in layout`, shadowing the
        # parameter with the loop variable — confusing and linter-flagged.
        for node in layouts:
            sub_layout = node['sub_layout']
            if len(sub_layout) == 0:
                leaves.append(node)
            else:
                leaves.extend(_preorder_traversal(sub_layout))
        return leaves
    # -------------------------------------------------------------------------------------------------------------------------
    sorted_layouts = split_layout(all_boxes, boundry, page_id) # first split into layouts, producing a tree
    total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
    return total_sorted_layout_blocks, sorted_layouts
def sort_text_block(text_block, layout_bboxes):
    """
    Sort one page's text blocks into reading order.

    Blocks are grouped by the layout region that contains them (layout_bboxes
    is already in reading order); within one region they are ordered
    top-to-bottom by y0.
    """
    # Map bbox tuple -> original block dict so blocks can be recovered after
    # sorting plain coordinate lists.
    block_by_bbox = {}
    coord_boxes = []
    for block in text_block:
        bb = block['bbox']
        block_by_bbox[(bb[0], bb[1], bb[2], bb[3])] = block
        coord_boxes.append(bb)

    ordered_blocks = []
    for lay in layout_bboxes:
        lx0, ly0, lx1, ly1 = lay['layout_bbox']
        # Grow the layout box by 1pt on every side so boxes touching the
        # border are still captured.
        members = get_bbox_in_boundry(coord_boxes, [lx0 - 1, ly0 - 1, lx1 + 1, ly1 + 1])
        members.sort(key=lambda b: b[1])  # top-to-bottom inside one layout
        for bb in members:
            ordered_blocks.append(block_by_bbox[(bb[0], bb[1], bb[2], bb[3])])

    return ordered_blocks
def __rect_filter_by_width(rect, page_w, page_h):
    """
    Keep only rects that straddle the page's vertical center line.

    `page_h` is unused but kept for call-site symmetry with the other filters.
    """
    center_x = page_w / 2
    return rect[0] < center_x < rect[2]
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """
    Collect colored blocks and horizontal rules that can act as layout
    splitters on *page*.

    Args:
        page: a fitz (PyMuPDF) page object.
        image_bboxes: bboxes of images; splitters overlapping them are dropped.
        table_bboxes: bboxes of tables; splitters overlapping them are dropped.

    Returns:
        list[list[float]]: candidate splitter rects [x0, y0, x1, y1].
    """
    spilter_bbox = []
    for block in page.get_cdrawings():
        # Fix: the original assigned `fill = block['fill']` in a separate
        # dead `if 'fill' in block:` branch and never used it, then repeated
        # the membership test. dict.get collapses both checks.
        fill = block.get('fill')
        # Keep only visibly filled (non-white) drawings.
        if fill and fill != (1.0, 1.0, 1.0):
            rect = block['rect']
            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) \
                    and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
                spilter_bbox.append(list(rect))

    # Some rects come back with zero or negative height, which can send the
    # downstream layout split into an infinite loop; clamp them to height 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    # __debug_show_page(page, spilter_bbox, [], [])

    return spilter_bbox
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Returns a list of fitz.IRect text-column boxes, sorted ascending by y0,
    then x0 (see module docstring for usage and restrictions).
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        # NOTE(review): the intersects_bboxes(temp, vert_bboxes) test does not
        # depend on b, so it is loop-invariant — and it is never evaluated at
        # all when bboxlist is empty.
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b != None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        # NOTE(review): at i == 0 the comparison partner nblocks[i - 1] is
        # nblocks[-1], i.e. the last element (wrap-around).
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # re-compute the block bbox from its lines, skipping near-empty lines
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return of no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
+ """ + + # get the file name + filename = sys.argv[1] + + # check if footer margin is given + if len(sys.argv) > 2: + footer_margin = int(sys.argv[2]) + else: # use default vaue + footer_margin = 50 + + # check if header margin is given + if len(sys.argv) > 3: + header_margin = int(sys.argv[3]) + else: # use default vaue + header_margin = 50 + + # open document + doc = fitz.open(filename) + + # iterate over the pages + for page in doc: + # remove any geometry issues + page.wrap_contents() + + # get the text bboxes + bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin) + + # prepare a canvas to draw rectangles and text + shape = page.new_shape() + + # iterate over the bboxes + for i, rect in enumerate(bboxes): + shape.draw_rect(rect) # draw a border + + # write sequence number + shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"]) + + # finish drawing / text with color red + shape.finish(color=fitz.pdfcolor["red"]) + shape.commit() # store to the page + + # save document with text bboxes + doc.ez_save(filename.replace(".pdf", "-blocks.pdf")) \ No newline at end of file diff --git a/magic_pdf/libs/Constants.py b/magic_pdf/libs/Constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0fa93282ec9b13cbfd72e013f5c618a96d243b --- /dev/null +++ b/magic_pdf/libs/Constants.py @@ -0,0 +1,11 @@ +""" +span维度自定义字段 +""" +# span是否是跨页合并的 +CROSS_PAGE = "cross_page" + +""" +block维度自定义字段 +""" +# block中lines是否被删除 +LINES_DELETED = "lines_deleted" \ No newline at end of file diff --git a/magic_pdf/libs/MakeContentConfig.py b/magic_pdf/libs/MakeContentConfig.py new file mode 100644 index 0000000000000000000000000000000000000000..b1650affcfd3514a5a1d317e243dab58785ef452 --- /dev/null +++ b/magic_pdf/libs/MakeContentConfig.py @@ -0,0 +1,10 @@ +class MakeMode: + MM_MD = "mm_markdown" + NLP_MD = "nlp_markdown" + STANDARD_FORMAT = "standard_format" + + +class DropMode: + WHOLE_PDF = "whole_pdf" + SINGLE_PAGE = 
def _is_in_or_part_overlap(box1, box2) -> bool:
    """
    Return True when the two boxes share any area — partial overlap or full
    containment either way.
    """
    if box1 is None or box2 is None:
        return False

    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2

    # Two axis-aligned rectangles intersect iff their projections on both
    # axes intersect.
    x_overlap = ax0 <= bx1 and bx0 <= ax1
    y_overlap = ay0 <= by1 and by0 <= ay1
    return x_overlap and y_overlap
""" + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + return (x0_1 >= x0_2 and # box1的左边界不在box2的左边外 + y0_1 >= y0_2 and # box1的上边界不在box2的上边外 + x1_1 <= x1_2 and # box1的右边界不在box2的右边外 + y1_1 <= y1_2) # box1的下边界不在box2的下边外 + +def _is_part_overlap(box1, box2) -> bool: + """ + 两个bbox是否有部分重叠,但不完全包含 + """ + if box1 is None or box2 is None: + return False + + return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2) + +def _left_intersect(left_box, right_box): + "检查两个box的左边界是否有交集,也就是left_box的右边界是否在right_box的左边界内" + if left_box is None or right_box is None: + return False + + x0_1, y0_1, x1_1, y1_1 = left_box + x0_2, y0_2, x1_2, y1_2 = right_box + + return x1_1>x0_2 and x0_1x1_2 and (y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1) + + +def _is_vertical_full_overlap(box1, box2, x_torlence=2): + """ + x方向上:要么box1包含box2, 要么box2包含box1。不能部分包含 + y方向上:box1和box2有重叠 + """ + # 解析box的坐标 + x11, y11, x12, y12 = box1 # 左上角和右下角的坐标 (x1, y1, x2, y2) + x21, y21, x22, y22 = box2 + + # 在x轴方向上,box1是否包含box2 或 box2包含box1 + contains_in_x = (x11-x_torlence <= x21 and x12+x_torlence >= x22) or (x21-x_torlence <= x11 and x22+x_torlence >= x12) + + # 在y轴方向上,box1和box2是否有重叠 + overlap_in_y = not (y12 < y21 or y11 > y22) + + return contains_in_x and overlap_in_y + + +def _is_bottom_full_overlap(box1, box2, y_tolerance=2): + """ + 检查box1下方和box2的上方有轻微的重叠,轻微程度收到y_tolerance的限制 + 这个函数和_is_vertical-full_overlap的区别是,这个函数允许box1和box2在x方向上有轻微的重叠,允许一定的模糊度 + """ + if box1 is None or box2 is None: + return False + + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + tolerance_margin = 2 + is_xdir_full_overlap = ((x0_1-tolerance_margin<=x0_2<=x1_1+tolerance_margin and x0_1-tolerance_margin<=x1_2<=x1_1+tolerance_margin) or (x0_2-tolerance_margin<=x0_1<=x1_2+tolerance_margin and x0_2-tolerance_margin<=x1_1<=x1_2+tolerance_margin)) + + return y0_2= 0.5 or ratio_2 >= 0.5 + + #vertical_overlap_cond = y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1 or y0_2<=y0_1<=y1_2 or y0_2<=y1_1<=y1_2 + return 
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether the two bboxes overlap on the y axis and the overlap
    exceeds `overlap_ratio_threshold` of the shorter bbox's height.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
    if min_height <= 0:
        # Degenerate (zero/negative height) bbox: previously raised
        # ZeroDivisionError; there is no meaningful overlap ratio.
        return False

    return (overlap / min_height) > overlap_ratio_threshold


def calculate_iou(bbox1, bbox2):
    """Compute the intersection-over-union (IoU) of two bounding boxes.

    Args:
        bbox1 (list[float]): [x1, y1, x2, y2] with (x1, y1) the top-left and
            (x2, y2) the bottom-right corner.
        bbox2 (list[float]): same format as bbox1.

    Returns:
        float: IoU in [0, 1].
    """
    # Intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

    # union = area1 + area2 - intersection
    return intersection_area / float(bbox1_area + bbox2_area - intersection_area)
def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """Return the overlap area of the two bboxes as a fraction of the
    smaller bbox's area (0.0 when disjoint or when the smaller area is 0).
    """
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    min_box_area = min((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]),
                       (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]))
    if min_box_area == 0:
        return 0.0  # degenerate box: avoid division by zero
    return intersection_area / min_box_area


def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the overlap area of bbox1 and bbox2 as a fraction of bbox1's area."""
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        return 0.0  # was int 0; float for consistency with the other ratios
    return intersection_area / bbox1_area


def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """If the overlap/min-area ratio (see
    calculate_overlap_area_2_minbox_area_ratio) exceeds `ratio`, return the
    smaller of the two bboxes; otherwise return None.
    """
    x1_min, y1_min, x1_max, y1_max = bbox1
    x2_min, y2_min, x2_max, y2_max = bbox2
    area1 = (x1_max - x1_min) * (y1_max - y1_min)
    area2 = (x2_max - x2_min) * (y2_max - y2_min)

    if calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) > ratio:
        return bbox1 if area1 <= area2 else bbox2
    return None


def get_bbox_in_boundry(bboxes: list, boundry: tuple) -> list:
    """Return the bboxes fully contained in the `boundry` rectangle.

    NOTE(review): 'boundry' is a typo for 'boundary', but the name is public
    and must be kept for callers.
    """
    x0, y0, x1, y1 = boundry
    return [box for box in bboxes
            if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]


def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
    """Return True when the bbox hugs the left or right edge of a page of the
    given width (within `side_threshold` of the page width).

    NOTE(review): `height` is currently unused; kept for interface stability.
    """
    x0, x1 = bbox[0], bbox[2]
    return x1 <= width * side_threshold or x0 >= width * (1 - side_threshold)
def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Among the blocks below obj_bbox (2pt tolerance) that overlap it
    horizontally, return the nearest one (smallest y0), or None."""
    candidates = [
        blk for blk in pymu_blocks
        if blk['bbox'][1] - obj_bbox[3] >= -2 and not _is_in(blk['bbox'], obj_bbox)
    ]

    def _x_overlaps(blk):
        bx0, _, bx1, _ = blk['bbox']
        return (obj_bbox[0] - 2 <= bx0 <= obj_bbox[2] + 2
                or obj_bbox[0] - 2 <= bx1 <= obj_bbox[2] + 2
                or bx0 - 2 <= obj_bbox[0] <= bx1 + 2
                or bx0 - 2 <= obj_bbox[2] <= bx1 + 2)

    candidates = [blk for blk in candidates if _x_overlaps(blk)]
    if not candidates:
        return None
    # Nearest below == smallest top edge (y0); min keeps the first tie,
    # matching the original stable sort.
    return min(candidates, key=lambda blk: blk['bbox'][1])


def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Among the blocks left of obj_bbox (2pt tolerance) that overlap it
    vertically, return the nearest one (largest x1), or None."""
    candidates = [
        blk for blk in pymu_blocks
        if obj_bbox[0] - blk['bbox'][2] >= -2 and not _is_in(blk['bbox'], obj_bbox)
    ]

    def _y_overlaps(blk):
        _, by0, _, by1 = blk['bbox']
        return (obj_bbox[1] - 2 <= by0 <= obj_bbox[3] + 2
                or obj_bbox[1] - 2 <= by1 <= obj_bbox[3] + 2
                or by0 - 2 <= obj_bbox[1] <= by1 + 2
                or by0 - 2 <= obj_bbox[3] <= by1 + 2)

    candidates = [blk for blk in candidates if _y_overlaps(blk)]
    if not candidates:
        return None
    # Nearest on the left == largest right edge (x1).
    return max(candidates, key=lambda blk: blk['bbox'][2])
def bbox_relative_pos(bbox1, bbox2):
    """Relative position of bbox1 with respect to bbox2.

    Args:
        bbox1: (x1, y1, x1b, y1b) — top-left and bottom-right corners.
        bbox2: (x2, y2, x2b, y2b) — top-left and bottom-right corners.

    Returns:
        (left, right, bottom, top): whether bbox1 lies strictly to the
        left of / right of / below / above bbox2.
    """
    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    return (x2b < x1, x1b < x2, y2b < y1, y1b < y2)


def bbox_distance(bbox1, bbox2):
    """Shortest distance between two axis-aligned rectangles.

    Args:
        bbox1, bbox2: (x1, y1, x2, y2) tuples.

    Returns:
        float: corner-to-corner distance when the boxes are diagonal to each
        other, the axis gap when they only differ on one axis, and 0 when
        they intersect.
    """
    def _point_dist(ax, ay, bx, by):
        return math.sqrt((ax - bx) ** 2 + (ay - by) ** 2)

    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)

    # Diagonal cases: closest points are corners.
    if top and left:
        return _point_dist(x1, y1b, x2b, y2)
    if left and bottom:
        return _point_dist(x1, y1, x2b, y2b)
    if bottom and right:
        return _point_dist(x1b, y1, x2, y2b)
    if right and top:
        return _point_dist(x1b, y1b, x2, y2)
    # Single-axis cases: distance is the gap on that axis.
    if left:
        return x1 - x2b
    if right:
        return x2 - x1b
    if bottom:
        return y1 - y2b
    if top:
        return y2 - y1b
    return 0  # rectangles intersect
import os
import csv
import json
import pandas as pd
from pandas import DataFrame as df
from matplotlib import pyplot as plt
from termcolor import cprint

"""
Execute this script in the following way:

1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:

    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json

2. Under the directory code-clean, execute the following command:

    $ python -m libs.calc_span_stats

"""


def print_green_on_red(text):
    """Print bold green text on a red background, followed by a blank line."""
    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")


def print_green(text):
    """Print bold green text, padded with blank lines."""
    print()
    cprint(text, "green", attrs=["bold"], end="\n\n")


def print_red(text):
    """Print bold red text, padded with blank lines."""
    print()
    cprint(text, "red", attrs=["bold"], end="\n\n")


def safe_get(dict_obj, key, default):
    """dict lookup that falls back to `default` when the key is missing
    or its value is None."""
    value = dict_obj.get(key)
    return default if value is None else value
span_font_flags_decoded = safe_get(span, "decomposed_flags", {}) + span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False) + span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False) + span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False) + span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False) + span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False) + span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False) + span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False) + + span_stats.append( + { + "span_id": span_id, # id of span + "page_id": page_id, # page number of pdf + "span_text": span_text, # text of span + "span_font_name": span_font_name, # font name of span + "span_font_size": span_font_size, # font size of span + "span_font_color": span_font_color, # font color of span + "span_font_flags": span_font_flags, # font flags of span + "span_is_superscript": int( + span_is_super_script + ), # indicate whether the span is super script or not + "span_is_italic": int(span_is_italic), # indicate whether the span is italic or not + "span_is_serifed": int(span_is_serifed), # indicate whether the span is serifed or not + "span_is_sans_serifed": int( + span_is_sans_serifed + ), # indicate whether the span is sans serifed or not + "span_is_monospaced": int( + span_is_monospaced + ), # indicate whether the span is monospaced or not + "span_is_proportional": int( + span_is_proportional + ), # indicate whether the span is proportional or not + "span_is_bold": int(span_is_bold), # indicate whether the span is bold or not + } + ) + + span_id += 1 + + span_stats = pd.DataFrame(span_stats) + # print(span_stats) + + return span_stats + + +def __find_pdf_dic_files( + jf_name="pdf_dic.json", + base_code_name="code-clean", + tgt_base_dir_name="tmp", + unittest_dir_name="unittest", + md_dir_name="md", + book_names=[ + "scihub", + ], # other 
def combine_span_texts(group_df, span_stats):
    """For every span in `group_df`, build a three-line snippet showing the
    previous / current / next span text (looked up by index in `span_stats`),
    each line prefixed with an arrow marker; snippets are joined with blank
    lines."""
    # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
    pointer_sign = "→ → → "

    snippets = []
    for _, row in group_df.iterrows():
        idx = row.name
        prev_text = span_stats.at[idx - 1, "span_text"] if (idx - 1) in span_stats.index else ""
        next_text = span_stats.at[idx + 1, "span_text"] if (idx + 1) in span_stats.index else ""
        lines = (prev_text, row["span_text"], next_text)
        snippets.append("\n".join(pointer_sign + text for text in lines))

    return "\n\n".join(snippets)


# pd.set_option("display.max_colwidth", None)  # set to None to show the full text
pd.set_option("display.max_rows", None)  # set to None to show more rows
save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv") + raw_df.to_csv(save_path, index=False) + + filtered_df = raw_df[raw_df["span_is_superscript"] == 1] + if filtered_df.empty: + print("No superscript span found!") + continue + + filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"]) + + combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df) # type: ignore + + final_df = filtered_grouped_df.size().reset_index(name="count") + final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True) + + print(final_df) + + final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n")) + + save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv") + # 使用 UTF-8 编码并添加 BOM,确保所有字段被双引号包围 + final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL) + + # 创建一个 2x2 的图表布局 + fig, axs = plt.subplots(2, 2, figsize=(15, 10)) + + # 按照 span_font_name 分类作图 + final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name") + + # 按照 span_font_size 分类作图 + final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size") + + # 按照 span_font_color 分类作图 + final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color") + + # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图 + grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"]) + grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping") + + # 调整布局 + plt.tight_layout() + + # 显示图表 + # plt.show() + + # 保存图表到 PNG 文件 + save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png") + plt.savefig(save_path) + + # 清除画布 + plt.clf() + + +if __name__ == "__main__": + main() diff --git a/magic_pdf/libs/commons.py b/magic_pdf/libs/commons.py new file mode 100644 index 
import datetime
import json
import os, re, configparser
import subprocess
import time

import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config

import fitz  # switched to the rebased implementation in 1.23.9
# import fitz_old as fitz  # pymupdf before 1.23.9


def get_delta_time(input_time):
    """Seconds elapsed since `input_time` (a time.time() value), rounded to 2 decimals."""
    return round(time.time() - input_time, 2)


def join_path(*args):
    """Join path segments with '/', trimming trailing slashes from each segment."""
    return '/'.join(str(s).rstrip('/') for s in args)


# Global error-log path so demos can reference the same location.
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/"  # local-test path only; must not be committed to main
json_dump_path = "s3://llm-pdf-text/json_dump/"

# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # business-specific path; does not belong in the base library


def get_top_percent_list(num_list, percent):
    """Return the largest `percent` fraction of `num_list`, sorted descending.

    :param num_list: list of numbers
    :param percent: fraction in [0, 1]
    :return: list with the top elements, descending
    """
    if not num_list:  # truthiness instead of len() == 0
        return []
    sorted_desc = sorted(num_list, reverse=True)
    cutoff = int(len(sorted_desc) * percent)
    return sorted_desc[:cutoff]


def formatted_time(time_stamp):
    """Format a unix timestamp as 'YYYY-MM-DD-HH:MM:SS' (local timezone)."""
    dt_object = datetime.datetime.fromtimestamp(time_stamp)
    return dt_object.strftime("%Y-%m-%d-%H:%M:%S")


def mymax(alist: list):
    """max() that returns 0 for an empty list instead of raising ValueError."""
    return max(alist, default=0)
"aws_access_key_id") + sk = config.get(profile, "aws_secret_access_key") + if profile == "default": + s3_str = config.get(f"{profile}", "s3") + else: + s3_str = config.get(f"profile {profile}", "s3") + end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE) + if end_match: + endpoint = end_match.group(1) + else: + raise ValueError(f"aws 配置文件中没有找到 endpoint_url") + style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE) + if style_match: + addressing_style = style_match.group(1) + else: + addressing_style = "path" + elif isinstance(profile, dict): + ak = profile["ak"] + sk = profile["sk"] + endpoint = profile["endpoint"] + addressing_style = "auto" + + return ak, sk, endpoint, addressing_style + + +def parse_bucket_key(s3_full_path: str): + """ + 输入 s3://bucket/path/to/my/file.txt + 输出 bucket, path/to/my/file.txt + """ + s3_full_path = s3_full_path.strip() + if s3_full_path.startswith("s3://"): + s3_full_path = s3_full_path[5:] + if s3_full_path.startswith("/"): + s3_full_path = s3_full_path[1:] + bucket, key = s3_full_path.split("/", 1) + return bucket, key + + +def read_file(pdf_path: str, s3_profile): + if pdf_path.startswith("s3://"): + ak, sk, end_point, addressing_style = parse_aws_param(s3_profile) + cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point, + config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'})) + bucket_name, bucket_key = parse_bucket_key(pdf_path) + res = cli.get_object(Bucket=bucket_name, Key=bucket_key) + file_content = res["Body"].read() + return file_content + else: + with open(pdf_path, "rb") as f: + return f.read() + + +def get_docx_model_output(pdf_model_output, page_id): + + model_output_json = pdf_model_output[page_id] + + return model_output_json + + +def list_dir(dir_path:str, s3_profile:str): + """ + 列出dir_path下的所有文件 + """ + ret = [] + + if 
dir_path.startswith("s3"): + ak, sk, end_point, addressing_style = parse_aws_param(s3_profile) + s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path) + bucket, path = s3info[0][0], s3info[0][1] + try: + cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point, + config=Config(s3={'addressing_style': addressing_style})) + def list_obj_scluster(): + marker = None + while True: + list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path) + if marker: + list_kwargs['Marker'] = marker + response = cli.list_objects(**list_kwargs) + contents = response.get("Contents", []) + yield from contents + if not response.get("IsTruncated") or len(contents)==0: + break + marker = contents[-1]['Key'] + + + for info in list_obj_scluster(): + file_path = info['Key'] + #size = info['Size'] + + if path!="": + afile = file_path[len(path):] + if afile.endswith(".json"): + ret.append(f"s3://{bucket}/{file_path}") + + return ret + + except Exception as e: + logger.exception(e) + exit(-1) + else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件 + + for root, dirs, files in os.walk(dir_path): + for file in files: + if file.endswith(".json"): + ret.append(join_path(root, file)) + ret.sort() + return ret + +def get_img_s3_client(save_path:str, image_s3_config:str): + """ + """ + if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client + ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config) + img_s3_client = boto3.client( + service_name="s3", + aws_access_key_id=ak, + aws_secret_access_key=sk, + endpoint_url=end_point, + config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}), + ) + else: + img_s3_client = None + + return img_s3_client + +if __name__=="__main__": + s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/" + s3_profile = "langchao" + ret = list_dir(s3_path, s3_profile) + print(ret) + \ No newline at end of file diff --git 
"""
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组

"""

import json
import os

from loguru import logger

from magic_pdf.libs.commons import parse_bucket_key


def read_config():
    """Load ~/magic-pdf.json; raise when the file does not exist."""
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

    if not os.path.exists(config_file):
        raise Exception(f"{config_file} not found")

    with open(config_file, "r") as f:
        return json.load(f)


def get_s3_config(bucket_name: str):
    """
    Read the (ak, sk, endpoint) triple for `bucket_name` from ~/magic-pdf.json,
    falling back to the "[default]" bucket entry.
    """
    config = read_config()
    bucket_info = config.get("bucket_info")

    if bucket_name not in bucket_info:
        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
    else:
        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]

    if access_key is None or secret_key is None or storage_endpoint is None:
        raise Exception("ak, sk or endpoint not found in magic-pdf.json")

    return access_key, secret_key, storage_endpoint


def get_s3_config_dict(path: str):
    """Same as get_s3_config but keyed by an s3 path and returned as a dict."""
    ak, sk, endpoint = get_s3_config(get_bucket_name(path))
    return {"ak": ak, "sk": sk, "endpoint": endpoint}


def get_bucket_name(path):
    """Extract the bucket name from an s3 path."""
    bucket, _ = parse_bucket_key(path)
    return bucket


def get_local_dir():
    """Temporary output directory from config, defaulting to /tmp."""
    return read_config().get("temp-output-dir", "/tmp")


def get_local_models_dir():
    """Model directory from config, defaulting to /tmp/models."""
    return read_config().get("models-dir", "/tmp/models")


def get_device():
    """Device mode from config ("cpu", "cuda", ...), defaulting to cpu."""
    return read_config().get("device-mode", "cpu")


if __name__ == "__main__":
    ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
    """Return the values of `input_dict` as a list (insertion order).

    Equivalent to the original manual loop, using the idiomatic
    dict.values() view instead.
    """
    return list(input_dict.values())


def get_scale_ratio(model_page_info, page):
    """Compute the horizontal/vertical scale ratios between the model's page
    size and the PyMuPDF page rendered at 72 dpi.

    :param model_page_info: dict exposing ['page_info']['width'/'height']
    :param page: a fitz page exposing get_pixmap(dpi=...)
    :return: (horizontal_scale_ratio, vertical_scale_ratio)
    """
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    width_from_json = model_page_info['page_info']['width']
    height_from_json = model_page_info['page_info']['height']
    horizontal_scale_ratio = width_from_json / pymu_width
    vertical_scale_ratio = height_from_json / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType, BlockType


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bbox of page `i` on `page` in the given RGB color.

    :param i: page index into bbox_list
    :param bbox_list: per-page list of (x0, y0, x1, y1) boxes
    :param page: fitz page to draw on
    :param rgb_config: color as a [0-255, 0-255, 0-255] list
    :param fill_config: True -> translucent filled rect, False -> outline only
    """
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255  # fitz expects color components in [0, 1]
        new_rgb.append(item)
    page_data = bbox_list[i]
    for bbox in page_data:
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle


def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Same as draw_bbox_without_number, but also writes the 1-based index of
    each box near its top-left corner."""
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for j, bbox in enumerate(page_data):
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        page.insert_text(
            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
        )  # Insert the index in the top left corner of the rectangle


def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
    """Render a debug PDF (<out_path>/layout.pdf) with one color-coded layer
    per block category (layouts, dropped blocks, tables, images, titles,
    texts, interline equations) collected from `pdf_info`."""
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
    # Collect per-page bbox lists for each category.
    for page in pdf_info:
        page_layout_list = []
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles = []
        texts = []
        interequations = []
        for layout in page["layout_bboxes"]:
            page_layout_list.append(layout["layout_bbox"])
        layout_bbox_list.append(page_layout_list)
        for dropped_bbox in page["discarded_blocks"]:
            page_dropped_list.append(dropped_bbox["bbox"])
        dropped_bbox_list.append(page_dropped_list)
        for block in page["para_blocks"]:
            bbox = block["bbox"]
            if block["type"] == BlockType.Table:
                tables.append(bbox)
                # Table blocks nest body/caption/footnote sub-blocks.
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.TableBody:
                        tables_body.append(bbox)
                    elif nested_block["type"] == BlockType.TableCaption:
                        tables_caption.append(bbox)
                    elif nested_block["type"] == BlockType.TableFootnote:
                        tables_footnote.append(bbox)
            elif block["type"] == BlockType.Image:
                imgs.append(bbox)
                # Image blocks nest body/caption sub-blocks.
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.ImageBody:
                        imgs_body.append(bbox)
                    elif nested_block["type"] == BlockType.ImageCaption:
                        imgs_caption.append(bbox)
            elif block["type"] == BlockType.Title:
                titles.append(bbox)
            elif block["type"] == BlockType.Text:
                texts.append(bbox)
            elif block["type"] == BlockType.InterlineEquation:
                interequations.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
        draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)

    # Save the PDF
    pdf_docs.save(f"{out_path}/layout.pdf")


def draw_span_bbox(pdf_info, pdf_bytes, out_path):
    """Render a debug PDF (<out_path>/spans.pdf) with one color-coded layer
    per span category; spans flagged as cross-page are drawn on the page
    that follows the one they were found on."""
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # NOTE: reads the page_* lists assigned in the per-page loop below;
        # Python late binding makes this work even though they are defined later.
        if span["type"] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span["bbox"])
            else:
                page_text_list.append(span["bbox"])
        elif span["type"] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span["bbox"])
            else:
                page_inline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.InterlineEquation:
            page_interline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.Image:
            page_image_list.append(span["bbox"])
        elif span["type"] == ContentType.Table:
            page_table_list.append(span["bbox"])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # Move cross-page spans into the lists of the following page.
        if len(next_page_text_list) > 0:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if len(next_page_inline_equation_list) > 0:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()

        # Build dropped_list.
        for block in page["discarded_blocks"]:
            if block["type"] == BlockType.Discarded:
                for line in block["lines"]:
                    for span in line["spans"]:
                        page_dropped_list.append(span["bbox"])
        dropped_list.append(page_dropped_list)
        # Build the remaining "useful" lists.
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
            ]:
                for line in block["lines"]:
                    for span in line["spans"]:
                        get_span_info(span)
            elif block["type"] in [BlockType.Image, BlockType.Table]:
                for sub_block in block["blocks"]:
                    for line in sub_block["lines"]:
                        for span in line["spans"]:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)
    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # Draw the collected data for the current page.
        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)

    # Save the PDF
    pdf_docs.save(f"{out_path}/spans.pdf")
class DropReason:
    """Machine-readable reasons for dropping a page or document during parsing.

    NOTE(review): several names carry historical misspellings
    (``TEXT_BLCOK...``, ``...lOAD...``). They are part of the public
    interface and are kept unchanged; only values matter downstream.
    """

    # Horizontally overlapping text blocks make reading order ambiguous.
    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap"
    # Blocks that must be kept overlap horizontally.
    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap"
    COMPLICATED_LAYOUT = "complicated_layout"  # layout too complex, unsupported for now
    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns"  # more than 2 columns unsupported
    # Colored background boxes change the reading order; such PDFs are unsupported.
    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box"
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs"  # special images too costly
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # special SVGs too costly
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # overall cost over budget
    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
    Exception = "_exception"  # an exception occurred while parsing
    ENCRYPTED = "encrypted"  # the PDF is encrypted
    EMPTY_PDF = "total_page=0"  # the PDF has zero pages
    NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot parse directly
    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block"  # paragraphs cannot be split cleanly
    TITLE_DETECTION_FAILED = "title_detection_failed"  # failed to detect titles
    TITLE_LEVEL_FAILED = "title_level_failed"  # failed to rank title levels (h1/h2/h3)
    PARA_SPLIT_FAILED = "para_split_failed"  # paragraph segmentation failed
    PARA_MERGE_FAILED = "para_merge_failed"  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = "not_allow_language"  # unsupported language
    SPECIAL_PDF = "special_pdf"
    PSEUDO_SINGLE_COLUMN = "pseudo_single_column"  # column layout cannot be decided reliably
    CAN_NOT_DETECT_PAGE_LAYOUT = "can_not_detect_page_layout"  # page layout analysis failed
    NEGATIVE_BBOX_AREA = "negative_bbox_area"  # scaling produced a negative bbox area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation"  # overlapping blocks cannot be separated
def compute_md5(file_bytes):
    """Return the uppercase hex MD5 digest of *file_bytes*."""
    return hashlib.md5(file_bytes).hexdigest().upper()


def compute_sha256(input_string):
    """Return the lowercase hex SHA-256 digest of *input_string*.

    The string is UTF-8 encoded first, since hash functions operate on bytes.
    """
    return hashlib.sha256(input_string.encode("utf-8")).hexdigest()
def detect_lang(text: str) -> str:
    """Best-effort language detection.

    Returns a lowercase language code (e.g. ``"en"``, ``"zh"``) or ``""``
    when detection fails or *text* is empty.

    Falls back to stripping Unicode control characters (category ``C``)
    when fast_langdetect rejects the raw input.
    """
    if len(text) == 0:
        return ""
    try:
        lang_upper = detect_language(text)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed to Exception. Retry with control chars removed.
        sanitized = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
        lang_upper = detect_language(sanitized)
    try:
        lang = lang_upper.lower()
    except Exception:
        # Detector may return a non-string (e.g. None); treat as unknown.
        lang = ""
    return lang
def float_gt(a, b):
    """Strictly-greater-than with a 1e-4 absolute tolerance.

    Values within the tolerance compare as not-greater.
    """
    if abs(a - b) <= 0.0001:
        return False
    return a > b


def float_equal(a, b):
    """Equality with a 1e-4 absolute tolerance."""
    return abs(a - b) <= 0.0001
"version": "3.7.1", + }, + "zh_core_web_sm": { + "type": "spacy", + "version": "3.7.0", + }, + "zh_core_web_md": { + "type": "spacy", + "version": "3.7.0", + }, + "zh_core_web_lg": { + "type": "spacy", + "version": "3.7.0", + }, + } + self.en_core_web_sm_model = en_core_web_sm.load() + self.zh_core_web_sm_model = zh_core_web_sm.load() + + def load_model(self, model_name, model_type, model_version): + if ( + model_name in self.nlp_models + and self.nlp_models[model_name]["type"] == model_type + and self.nlp_models[model_name]["version"] == model_version + ): + return spacy.load(model_name) if spacy.util.is_package(model_name) else None + + else: + logger.error(f"Unsupported model name or version: {model_name} {model_version}") + return None + + def detect_language(self, text, use_langdetect=False): + if len(text) == 0: + return None + if use_langdetect: + # print("use_langdetect") + # print(detect_lang(text)) + # return detect_lang(text) + if detect_lang(text) == "zh": + return "zh" + else: + return "en" + + if not use_langdetect: + en_count = len(re.findall(r"[a-zA-Z]", text)) + cn_count = len(re.findall(r"[\u4e00-\u9fff]", text)) + + if en_count > cn_count: + return "en" + + if cn_count > en_count: + return "zh" + + def detect_entity_catgr_using_nlp(self, text, threshold=0.5): + """ + Detect entity categories using NLP models and return the most frequent entity types. + + Parameters + ---------- + text : str + Text to be processed. + + Returns + ------- + str + The most frequent entity type. 
+ """ + lang = self.detect_language(text, use_langdetect=True) + + if lang == "en": + nlp_model = self.en_core_web_sm_model + elif lang == "zh": + nlp_model = self.zh_core_web_sm_model + else: + # logger.error(f"Unsupported language: {lang}") + return {} + + # Splitting text into smaller parts + text_parts = re.split(r"[,;,;、\s & |]+", text) + + text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] # Remove non-words + text_combined = " ".join(text_parts) + + try: + doc = nlp_model(text_combined) + entity_counts = Counter([ent.label_ for ent in doc.ents]) + word_counts_in_entities = Counter() + + for ent in doc.ents: + word_counts_in_entities[ent.label_] += len(ent.text.split()) + + total_words_in_entities = sum(word_counts_in_entities.values()) + total_words = len([token for token in doc if not token.is_punct]) + + if total_words_in_entities == 0 or total_words == 0: + return None + + entity_percentage = total_words_in_entities / total_words + if entity_percentage < 0.5: + return None + + most_common_entity, word_count = word_counts_in_entities.most_common(1)[0] + entity_percentage = word_count / total_words_in_entities + + if entity_percentage >= threshold: + return most_common_entity + else: + return None + except Exception as e: + logger.error(f"Error in entity detection: {e}") + return None + + +def __main__(): + nlpModel = NLPModels() + + test_strings = [ + "张三", + "张三, 李四,王五; 赵六", + "John Doe", + "Jane Smith", + "Lee, John", + "John Doe, Jane Smith; Alice Johnson,Bob Lee", + "孙七, Michael Jordan;赵八", + "David Smith Michael O'Connor; Kevin ßáçøñ", + "李雷·韩梅梅, 张三·李四", + "Charles Robert Darwin, Isaac Newton", + "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔", + "John Doe, Jane Smith; Alice Johnson", + "张三, 李四,王五; 赵六", + "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG", + "Rachel Mills & William Barry & Susanne B. 
class ContentType:
    """String tags for span-level content categories."""

    Image = "image"
    Table = "table"
    Text = "text"
    InlineEquation = "inline_equation"
    InterlineEquation = "interline_equation"


class BlockType:
    """String tags for block-level layout categories."""

    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
    Discarded = "discarded"
def parse_s3_range_params(s3path: str):
    """Extract the byte-range suffix of an S3 path.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ["0", "81350"]

    Returns ``None`` when no ``?bytes=`` suffix is present.
    """
    _head, *tail = s3path.split("?bytes=")
    if not tail:
        return None
    return tail[0].split(",")
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the PDF's text extracts cleanly (no mojibake).

    Returns ``True`` for a normal document and ``False`` when more than 5%
    of the sampled text consists of garbled characters.
    """
    # pdfminer is slow, so inspect only a random sample of ~10 pages.
    sample_docs = extract_pages(src_pdf_bytes)
    sample_pdf_bytes = sample_docs.tobytes()
    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    text = extract_text(sample_pdf_file_like_object)
    text = text.replace("\n", "")
    # Garbled text comes out of pdfminer as "(cid:NNN)" tokens.
    cid_pattern = re.compile(r"\(cid:\d+\)")
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Ratio of cid tokens relative to (real characters + cid tokens).
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    logger.info(
        f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_ratio: {cid_chars_ratio}"
    )
    # A document whose sampled text is >5% mojibake is treated as garbled.
    return cid_chars_ratio <= 0.05
def __inc_dict_val(mp, key, val_inc: int):
    """Add *val_inc* to ``mp[key]``, treating a missing key as 0."""
    # dict.get with a default replaces the old truthiness-based if/else,
    # which also behaved incorrectly in spirit (though not in effect) for
    # an existing value of 0.
    mp[key] = mp.get(key, 0) + val_inc


def get_text_block_base_info(block):
    """Return the dominant ``(color, size, font)`` of a text block.

    Dominance is decided by the number of characters rendered with each
    (color, size rounded to 2 decimals, font) combination. Raises
    ``ValueError`` for a block with no spans, as before.
    """
    counter = {}
    for line in block["lines"]:
        for span in line["spans"]:
            key = (span["color"], round(span["size"], 2), span["font"])
            __inc_dict_val(counter, key, len(span["text"]))

    # max over dict keys with the counts as the sort key; ties break by
    # insertion order, matching the original behavior.
    return max(counter, key=counter.get)
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, save_path: str):
    """Draw block/image/table bboxes of each page onto a debug PDF.

    One new page (matching the source page size) is appended per entry of
    *paras_dict*. If *save_path* already exists the pages are appended
    incrementally, otherwise a fresh PDF is created.
    """
    # Append to an existing debug PDF when present, else start blank.
    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    color_map = {
        'image': fitz.pdfcolor["yellow"],
        'text': fitz.pdfcolor['blue'],
        "table": fitz.pdfcolor['green'],
    }

    def paint(page, bbox, **style):
        # One shape per rectangle, mirroring the original
        # draw_rect / finish / finish / commit cycle.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(bbox))
        shape.finish(**style)
        shape.finish()
        shape.commit()

    for _, page_info in paras_dict.items():
        page_idx = page_info['page_idx']
        src_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

        # Pre-processed text blocks: translucent blue fill.
        for block in page_info['preproc_blocks']:
            paint(new_page, block['bbox'], color=None, fill=color_map['text'], fill_opacity=0.2)

        # Kept images: solid yellow fill.
        for img in page_info['images']:
            paint(new_page, img['bbox'], color=None, fill=color_map['image'])

        # Backed-up (removed) images: yellow outline only.
        for img in page_info['image_backup']:
            paint(new_page, img['bbox'], color=color_map['image'], fill=None)

        # Dropped text blocks: translucent black fill.
        for tb in page_info['droped_text_block']:
            paint(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)

        # TODO table
        for tb in page_info['tables']:
            paint(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)

    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if is_new_pdf:
        doc.save(save_path)
    else:
        doc.saveIncr()
    doc.close()
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str):
    """Overlay layout boxes (numbered by reading order) onto *pdf_path*.

    Also shades the *header* / *footer* bands when provided. If the target
    PDF already exists it is updated incrementally, otherwise a blank
    document is created.
    """
    # Reuse the existing debug PDF when present, otherwise start blank.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    font_size = 10
    border_offset = 1

    for page_info in paras_dict.values():
        page = doc[page_info['page_idx']]
        shape = page.new_shape()

        for order, layout in enumerate(page_info['layout_bboxes']):
            x0, y0, x1, y1 = layout['layout_bbox']
            # Unrecognized ('U') layouts get a pink fill so they stand out.
            fill_color = fitz.pdfcolor['pink'] if layout['layout_label'] == 'U' else None
            # Shrink horizontally / grow vertically by one point so
            # neighboring boxes stay visually distinct.
            box = [x0 + 1, y0 - border_offset, x1 - 1, y1 + border_offset]
            shape.draw_rect(fitz.Rect(*box))
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
            # Stamp the reading order just inside the top-left corner.
            shape.insert_text(
                (box[0] + 1, box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0)
            )

        # Shade header / footer bands.
        if header:
            shape.draw_rect(fitz.Rect(header))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
        if footer:
            shape.draw_rect(fitz.Rect(footer))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

        shape.commit()

    if is_new_pdf:
        doc.save(pdf_path)
    else:
        doc.saveIncr()
    doc.close()
def dict_compare(d1, d2):
    """True when both dicts hold exactly the same key/value pairs."""
    return d1.items() == d2.items()


def remove_duplicates_dicts(lst):
    """Return *lst* with duplicate dicts removed, keeping first occurrences.

    Dicts are unhashable, so this is an O(n^2) pairwise scan — fine for the
    small per-page model lists it is used on.
    """
    unique_dicts = []
    for candidate in lst:
        if all(not dict_compare(candidate, seen) for seen in unique_dicts):
            unique_dicts.append(candidate)
    return unique_dicts
class ModelSingleton:
    """Process-wide cache of initialized models, keyed by (ocr, show_log).

    Every construction of this class returns the same instance, so model
    initialization happens at most once per configuration.
    """

    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        # Classic lazy singleton.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, ocr: bool, show_log: bool):
        """Return the model for this configuration, initializing it on first use."""
        key = (ocr, show_log)
        try:
            return self._models[key]
        except KeyError:
            model = custom_model_init(ocr=ocr, show_log=show_log)
            self._models[key] = model
            return model
enumerate(images): + img = img_dict["img"] + page_width = img_dict["width"] + page_height = img_dict["height"] + result = custom_model(img) + page_info = {"page_no": index, "height": page_height, "width": page_width} + page_dict = {"layout_dets": result, "page_info": page_info} + model_json.append(page_dict) + doc_analyze_cost = time.time() - doc_analyze_start + logger.info(f"doc analyze cost: {doc_analyze_cost}") + + return model_json diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c95f21e6ddb37c8698458f5856714c4eabb813e --- /dev/null +++ b/magic_pdf/model/magic_model.py @@ -0,0 +1,636 @@ +import json +import math + +from magic_pdf.libs.commons import fitz +from loguru import logger + +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.coordinate_transform import get_scale_ratio +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +from magic_pdf.libs.math import float_gt +from magic_pdf.libs.boxbase import ( + _is_in, + bbox_relative_pos, + bbox_distance, + _is_part_overlap, + calculate_overlap_area_in_bbox1_area_ratio, + calculate_iou, +) +from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum + +CAPATION_OVERLAP_AREA_RATIO = 0.6 + + +class MagicModel: + """ + 每个函数没有得到元素的时候返回空list + + """ + + def __fix_axis(self): + for model_page_info in self.__model_list: + need_remove_list = [] + page_no = model_page_info["page_info"]["page_no"] + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( + model_page_info, self.__docs[page_no] + ) + layout_dets = model_page_info["layout_dets"] + for layout_det in layout_dets: + + if layout_det.get("bbox") is not None: + # 兼容直接输出bbox的模型数据,如paddle + x0, y0, x1, y1 = layout_det["bbox"] + else: + # 兼容直接输出poly的模型数据,如xxx + x0, y0, _, _, x1, y1, _, _ = layout_det["poly"] + + bbox = [ + 
    def __fix_by_remove_low_confidence(self):
        """Drop detections whose confidence score is <= 0.05.

        Mutates each page's ``layout_dets`` list in place; removals are
        collected first so the list is not modified while being iterated.
        """
        for model_page_info in self.__model_list:
            need_remove_list = []
            layout_dets = model_page_info["layout_dets"]
            for layout_det in layout_dets:
                if layout_det["score"] <= 0.05:
                    need_remove_list.append(layout_det)
                else:
                    continue
            for need_remove in need_remove_list:
                layout_dets.remove(need_remove)

    def __fix_by_remove_high_iou_and_low_confidence(self):
        """Of two heavily overlapping detections (IoU > 0.9), drop the lower score.

        Only genuine layout categories (0-9) compete with each other; spans
        such as OCR text (15) are left untouched. O(n^2) pairwise pass per
        page; each pair is visited twice ((i,j) and (j,i)), the membership
        check on ``need_remove_list`` keeps the removal list free of
        duplicates.
        """
        for model_page_info in self.__model_list:
            need_remove_list = []
            layout_dets = model_page_info["layout_dets"]
            for layout_det1 in layout_dets:
                for layout_det2 in layout_dets:
                    if layout_det1 == layout_det2:
                        continue
                    if layout_det1["category_id"] in [
                        0,
                        1,
                        2,
                        3,
                        4,
                        5,
                        6,
                        7,
                        8,
                        9,
                    ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                        if (
                            calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
                            > 0.9
                        ):
                            # On a score tie layout_det2 is removed.
                            if layout_det1["score"] < layout_det2["score"]:
                                layout_det_need_remove = layout_det1
                            else:
                                layout_det_need_remove = layout_det2

                            if layout_det_need_remove not in need_remove_list:
                                need_remove_list.append(layout_det_need_remove)
                        else:
                            continue
                    else:
                        continue
            for need_remove in need_remove_list:
                layout_dets.remove(need_remove)

    def __init__(self, model_list: list, docs: fitz.Document):
        """Normalize raw per-page model output against the PDF document.

        :param model_list: per-page model dicts ("layout_dets" + "page_info")
        :param docs: the fitz document the model output was produced from
        """
        self.__model_list = model_list
        self.__docs = docs
        # Add a "bbox" to every detection (scales poly -> bbox into PDF coords).
        """为所有模型数据添加bbox信息(缩放,poly->bbox)"""
        self.__fix_axis()
        # Remove very low-confidence detections (<= 0.05) to improve quality.
        """删除置信度特别低的模型数据(<0.05),提高质量"""
        self.__fix_by_remove_low_confidence()
        # Of two detections with IoU > 0.9, keep only the higher-scored one.
        """删除高iou(>0.9)数据中置信度较低的那个"""
        self.__fix_by_remove_high_iou_and_low_confidence()
    def __tie_up_category_by_distance(
        self, page_no, subject_category_id, object_category_id
    ):
        """Pair each subject box with its nearest object box(es) on a page.

        Assumption: each subject has at most one object (several adjacent
        objects may be merged into a single one), and each object belongs to
        at most one subject. Used e.g. for figure->caption (3,4) and
        table->caption/footnote (5,6 / 5,7) pairing.

        Returns (ret, total_subject_object_dis) where ``ret`` is a list of
        dicts with "subject_body", "all", "score" and optionally
        "object_body", and the second value is a pairing-cost heuristic.
        """
        ret = []
        MAX_DIS_OF_POINT = 10**9 + 7

        # The subject and object bboxes are merged into one big bbox
        # ("merged bbox"). Select all subjects that overlap the merged bbox
        # with an overlap area larger than the object's own area, then take
        # the shortest distance between those subjects and the object.
        def may_find_other_nearest_bbox(subject_idx, object_idx):
            ret = float("inf")

            x0 = min(
                all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0]
            )
            y0 = min(
                all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1]
            )
            x1 = max(
                all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2]
            )
            y1 = max(
                all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3]
            )

            object_area = abs(
                all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]
            ) * abs(
                all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1]
            )

            for i in range(len(all_bboxes)):
                if (
                    i == subject_idx
                    or all_bboxes[i]["category_id"] != subject_category_id
                ):
                    continue
                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(
                    all_bboxes[i]["bbox"], [x0, y0, x1, y1]
                ):

                    i_area = abs(
                        all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
                    ) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
                    if i_area >= object_area:
                        # NOTE(review): `min(float("inf"), ...)` never folds in
                        # the previous `ret`, so this keeps the LAST qualifying
                        # distance rather than the minimum; looks like it was
                        # meant to be `min(ret, dis[i][object_idx])` — confirm.
                        ret = min(float("inf"), dis[i][object_idx])

            return ret

        def expand_bbbox(idxes):
            # Union bbox of all boxes referenced by `idxes`.
            x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
            y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
            x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
            y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
            return min(x0s), min(y0s), max(x1s), max(y1s)

        # Collect subject/object candidates, discarding boxes fully contained
        # in another box of the same category.
        subjects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
                    filter(
                        lambda x: x["category_id"] == subject_category_id,
                        self.__model_list[page_no]["layout_dets"],
                    ),
                )
            )
        )

        objects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
                    filter(
                        lambda x: x["category_id"] == object_category_id,
                        self.__model_list[page_no]["layout_dets"],
                    ),
                )
            )
        )
        subject_object_relation_map = {}

        subjects.sort(
            key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2
        )  # get the distance !

        all_bboxes = []

        for v in subjects:
            all_bboxes.append(
                {
                    "category_id": subject_category_id,
                    "bbox": v["bbox"],
                    "score": v["score"],
                }
            )

        for v in objects:
            all_bboxes.append(
                {
                    "category_id": object_category_id,
                    "bbox": v["bbox"],
                    "score": v["score"],
                }
            )

        N = len(all_bboxes)
        dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]

        # Symmetric distance matrix; subject-subject pairs stay "infinite".
        for i in range(N):
            for j in range(i):
                if (
                    all_bboxes[i]["category_id"] == subject_category_id
                    and all_bboxes[j]["category_id"] == subject_category_id
                ):
                    continue

                dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
                dis[j][i] = dis[i][j]

        used = set()
        for i in range(N):
            # Find the object(s) associated with the i-th subject.
            if all_bboxes[i]["category_id"] != subject_category_id:
                continue
            seen = set()
            candidates = []
            arr = []
            for j in range(N):

                # Number of "strictly to one side" flags; more than one means
                # the boxes are diagonal to each other and are not paired.
                pos_flag_count = sum(
                    list(
                        map(
                            lambda x: 1 if x else 0,
                            bbox_relative_pos(
                                all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                            ),
                        )
                    )
                )
                if pos_flag_count > 1:
                    continue
                if (
                    all_bboxes[j]["category_id"] != object_category_id
                    or j in used
                    or dis[i][j] == MAX_DIS_OF_POINT
                ):
                    continue
                left, right, _, _ = bbox_relative_pos(
                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                )  # the pos_flag_count logic above guarantees this is valid
                # Reject objects farther away than one subject width/height.
                if left or right:
                    one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
                else:
                    one_way_dis = all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1]
                if dis[i][j] > one_way_dis:
                    continue
                arr.append((dis[i][j], j))

            arr.sort(key=lambda x: x[0])
            if len(arr) > 0:
                # bug: the object nearest to this subject may lie across
                # another subject, e.g. [this subject] [some subject]
                # [the nearest object of subject].
                if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:

                    candidates.append(arr[0][1])
                    seen.add(arr[0][1])

            # The initial seed object has been selected; now grow the cluster
            # by repeatedly absorbing objects nearest to the current ones.
            for j in set(candidates):
                tmp = []
                for k in range(i + 1, N):
                    pos_flag_count = sum(
                        list(
                            map(
                                lambda x: 1 if x else 0,
                                bbox_relative_pos(
                                    all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
                                ),
                            )
                        )
                    )

                    if pos_flag_count > 1:
                        continue

                    if (
                        all_bboxes[k]["category_id"] != object_category_id
                        or k in used
                        or k in seen
                        or dis[j][k] == MAX_DIS_OF_POINT
                        or dis[j][k] > dis[i][j]
                    ):
                        continue

                    is_nearest = True
                    for l in range(i + 1, N):
                        if l in (j, k) or l in used or l in seen:
                            continue

                        if not float_gt(dis[l][k], dis[j][k]):
                            is_nearest = False
                            break

                    if is_nearest:
                        nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
                        n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
                        if float_gt(dis[i][j], n_dis):
                            continue
                        tmp.append(k)
                        seen.add(k)

                candidates = tmp
                if len(candidates) == 0:
                    break

            # All captions nearest to this figure (and captions nearest to
            # those captions) have been collected. First expand the bbox,
            ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
            ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]

            # then split into 4 candidate regions (left/top/bottom/right of
            # the subject) and measure the merged object area in each.
            caption_poses = [
                [ox0, oy0, ix0, oy1],
                [ox0, oy0, ox1, iy0],
                [ox0, iy1, ox1, oy1],
                [ix1, oy0, ox1, oy1],
            ]

            caption_areas = []
            for bbox in caption_poses:
                embed_arr = []
                for idx in seen:
                    if (
                        calculate_overlap_area_in_bbox1_area_ratio(
                            all_bboxes[idx]["bbox"], bbox
                        )
                        > CAPATION_OVERLAP_AREA_RATIO
                    ):
                        embed_arr.append(idx)

                if len(embed_arr) > 0:
                    embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
                    embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
                    embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
                    embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
                    caption_areas.append(
                        int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
                    )
                else:
                    caption_areas.append(0)

            # Keep only the objects in the winning (largest-area) region.
            subject_object_relation_map[i] = []
            if max(caption_areas) > 0:
                max_area_idx = caption_areas.index(max(caption_areas))
                caption_bbox = caption_poses[max_area_idx]

                for j in seen:
                    if (
                        calculate_overlap_area_in_bbox1_area_ratio(
                            all_bboxes[j]["bbox"], caption_bbox
                        )
                        > CAPATION_OVERLAP_AREA_RATIO
                    ):
                        used.add(j)
                        subject_object_relation_map[i].append(j)

        for i in sorted(subject_object_relation_map.keys()):
            result = {
                "subject_body": all_bboxes[i]["bbox"],
                "all": all_bboxes[i]["bbox"],
                "score": all_bboxes[i]["score"],
            }

            if len(subject_object_relation_map[i]) > 0:
                x0 = min(
                    [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
                )
                y0 = min(
                    [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
                )
                x1 = max(
                    [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
                )
                y1 = max(
                    [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
                )
                result["object_body"] = [x0, y0, x1, y1]
                result["all"] = [
                    min(x0, all_bboxes[i]["bbox"][0]),
                    min(y0, all_bboxes[i]["bbox"][1]),
                    max(x1, all_bboxes[i]["bbox"][2]),
                    max(y1, all_bboxes[i]["bbox"][3]),
                ]
            ret.append(result)

        total_subject_object_dis = 0
        # Accumulate the distances of the pairs already matched.
        for i in subject_object_relation_map.keys():
            for j in subject_object_relation_map[i]:
                total_subject_object_dis += bbox_distance(
                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                )

        # Add distances for unmatched subjects/objects (approximate version).
        # NOTE(review): the comprehension below tests
        # `subject_object_relation_map[i]` with the stale loop variable `i`
        # instead of `key` — presumably intended to be `[key]`; confirm.
        with_caption_subject = set(
            [
                key
                for key in subject_object_relation_map.keys()
                if len(subject_object_relation_map[i]) > 0
            ]
        )
        for i in range(N):
            if all_bboxes[i]["category_id"] != object_category_id or i in used:
                continue
            candidates = []
            for j in range(N):
                if (
                    all_bboxes[j]["category_id"] != subject_category_id
                    or j in with_caption_subject
                ):
                    continue
                candidates.append((dis[i][j], j))
            if len(candidates) > 0:
                candidates.sort(key=lambda x: x[0])
                # NOTE(review): `candidates[0][1]` is the subject INDEX, not
                # the distance (`candidates[0][0]`), and the `j` added below
                # is the stale post-loop value rather than the matched
                # subject's index — both look like bugs, but they only affect
                # this heuristic total, which callers currently discard.
                total_subject_object_dis += candidates[0][1]
                with_caption_subject.add(j)
        return ret, total_subject_object_dis

    def get_imgs(self, page_no: int):
        """Return figure records (category 3) paired with captions (4)."""
        figure_captions, _ = self.__tie_up_category_by_distance(
            page_no, 3, 4
        )
        return [
            {
                "bbox": record["all"],
                "img_body_bbox": record["subject_body"],
                "img_caption_bbox": record.get("object_body", None),
                "score": record["score"],
            }
            for record in figure_captions
        ]

    def get_tables(
        self, page_no: int
    ) -> list:  # three bboxes: caption, table body, table footnote
        """Return table records (category 5) paired with captions (6) and footnotes (7)."""
        with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
        with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)
        ret = []
        N, M = len(with_captions), len(with_footnotes)
        # Both pairings iterate the same subjects in the same order, so the
        # i-th entries describe the same table.
        assert N == M
        for i in range(N):
            record = {
                "score": with_captions[i]["score"],
                "table_caption_bbox": with_captions[i].get("object_body", None),
                "table_body_bbox": with_captions[i]["subject_body"],
                "table_footnote_bbox": with_footnotes[i].get("object_body", None),
            }

            x0 = min(with_captions[i]["all"][0], with_footnotes[i]["all"][0])
            y0 = min(with_captions[i]["all"][1], with_footnotes[i]["all"][1])
            x1 = max(with_captions[i]["all"][2], with_footnotes[i]["all"][2])
            y1 = max(with_captions[i]["all"][3], with_footnotes[i]["all"][3])
            record["bbox"] = [x0, y0, x1, y1]
            ret.append(record)
        return ret
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no + ) + return inline_equations, interline_equations, interline_equations_blocks + + def get_discarded(self, page_no: int) -> list: # 自研模型,只有坐标 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no) + return blocks + + def get_text_blocks(self, page_no: int) -> list: # 自研模型搞的,只有坐标,没有字 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no) + return blocks + + def get_title_blocks(self, page_no: int) -> list: # 自研模型,只有坐标,没字 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no) + return blocks + + def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标 + text_spans = [] + model_page_info = self.__model_list[page_no] + layout_dets = model_page_info["layout_dets"] + for layout_det in layout_dets: + if layout_det["category_id"] == "15": + span = { + "bbox": layout_det["bbox"], + "content": layout_det["text"], + } + text_spans.append(span) + return text_spans + + def get_all_spans(self, page_no: int) -> list: + def remove_duplicate_spans(spans): + new_spans = [] + for span in spans: + if not any(span == existing_span for existing_span in new_spans): + new_spans.append(span) + return new_spans + + all_spans = [] + model_page_info = self.__model_list[page_no] + layout_dets = model_page_info["layout_dets"] + allow_category_id_list = [3, 5, 13, 14, 15] + """当成span拼接的""" + # 3: 'image', # 图片 + # 5: 'table', # 表格 + # 13: 'inline_equation', # 行内公式 + # 14: 'interline_equation', # 行间公式 + # 15: 'text', # ocr识别文本 + for layout_det in layout_dets: + category_id = layout_det["category_id"] + if category_id in allow_category_id_list: + span = {"bbox": layout_det["bbox"], "score": layout_det["score"]} + if category_id == 3: + span["type"] = ContentType.Image + elif category_id == 5: + span["type"] = ContentType.Table + elif category_id == 13: + span["content"] = layout_det["latex"] + span["type"] = ContentType.InlineEquation + elif category_id == 14: + 
span["content"] = layout_det["latex"] + span["type"] = ContentType.InterlineEquation + elif category_id == 15: + span["content"] = layout_det["text"] + span["type"] = ContentType.Text + all_spans.append(span) + return remove_duplicate_spans(all_spans) + + def get_page_size(self, page_no: int): # 获取页面宽高 + # 获取当前页的page对象 + page = self.__docs[page_no] + # 获取当前页的宽高 + page_w = page.rect.width + page_h = page.rect.height + return page_w, page_h + + def __get_blocks_by_type( + self, type: int, page_no: int, extra_col: list[str] = [] + ) -> list: + blocks = [] + for page_dict in self.__model_list: + layout_dets = page_dict.get("layout_dets", []) + page_info = page_dict.get("page_info", {}) + page_number = page_info.get("page_no", -1) + if page_no != page_number: + continue + for item in layout_dets: + category_id = item.get("category_id", -1) + bbox = item.get("bbox", None) + + if category_id == type: + block = { + "bbox": bbox, + "score": item.get("score"), + } + for col in extra_col: + block[col] = item.get(col, None) + blocks.append(block) + return blocks + + def get_model_list(self, page_no): + return self.__model_list[page_no] + + +if __name__ == "__main__": + drw = DiskReaderWriter(r"D:/project/20231108code-clean") + if 0: + pdf_file_path = r"linshixuqiu\19983-00.pdf" + model_file_path = r"linshixuqiu\19983-00_new.json" + pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN) + model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT) + model_list = json.loads(model_json_txt) + write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00" + img_bucket_path = "imgs" + img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path)) + pdf_docs = fitz.open("pdf", pdf_bytes) + magic_model = MagicModel(model_list, pdf_docs) + + if 1: + model_list = json.loads( + drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json") + ) + pdf_bytes = drw.read( + "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN + ) + pdf_docs = 
class MODEL:
    """Registry of supported analysis backends (magic_pdf/model/model_list.py)."""
    Paddle = "pp_structure_v2"
    PEK = "pdf_extract_kit"


def mfd_model_init(weight):
    """Load the math-formula-detection YOLO model from a weights file."""
    mfd_model = YOLO(weight)
    return mfd_model


def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
    """Build the UniMERNet math-formula-recognition model and its processor.

    :param weight_dir: directory holding pytorch_model.bin plus tokenizer files
    :param cfg_path: path to the UniMERNet yaml demo config
    :param _device_: torch device string the model is moved to
    :return: (model, vis_processor) tuple
    """
    args = argparse.Namespace(cfg_path=cfg_path, options=None)
    cfg = Config(args)
    # Point the config at the local weights/tokenizer instead of the defaults.
    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
    cfg.config.model.model_config.model_name = weight_dir
    cfg.config.model.tokenizer_config.path = weight_dir
    task = tasks.setup_task(cfg)
    model = task.build_model(cfg)
    model = model.to(_device_)
    vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
    return model, vis_processor


def layout_model_init(weight, config_file, device):
    """Build the LayoutLMv3 layout-detection predictor."""
    model = Layoutlmv3_Predictor(weight, config_file, device)
    return model


class MathDataset(Dataset):
    """Dataset of formula crops; items may be file paths or already-loaded images."""

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Strings are treated as file paths; anything else is assumed to be
        # an already-loaded (PIL) image.
        if isinstance(self.image_paths[idx], str):
            raw_image = Image.open(self.image_paths[idx])
        else:
            raw_image = self.image_paths[idx]
        # Bug fix: the original returned an unbound local (UnboundLocalError)
        # when no transform was set; fall back to the raw image instead.
        if self.transform:
            return self.transform(raw_image)
        return raw_image
    def __call__(self, image):
        """Run the full analysis pipeline on one page image.

        Pipeline: layout detection -> math-formula detection -> math-formula
        recognition -> (optionally) OCR over text-bearing regions.

        :param image: page raster as an RGB numpy array (it is passed to
            ``Image.fromarray``) — assumed HxWx3 uint8; TODO confirm with caller.
        :return: ``layout_res``, a list of detection dicts carrying
            'category_id', 'poly', 'score' and, per category, 'latex' or 'text'.
        """

        latex_filling_list = []
        mf_image_list = []

        # Layout detection.
        layout_start = time.time()
        layout_res = self.layout_model(image, ignore_catids=[])
        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f"layout detection cost: {layout_cost}")

        # Math-formula detection. category_id = 13 + class index
        # (presumably 13 = inline, 14 = isolated — confirm with the MFD model's
        # class map).
        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
            new_item = {
                'category_id': 13 + int(cla.item()),
                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                'score': round(float(conf.item()), 2),
                'latex': '',
            }
            layout_res.append(new_item)
            # The same dict object is referenced from both lists, so the MFR
            # stage below writes 'latex' straight into layout_res.
            latex_filling_list.append(new_item)
            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
            mf_image_list.append(bbox_img)

        # Math-formula recognition: batch the detected crops through the model.
        mfr_start = time.time()
        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
        mfr_res = []
        for mf_img in dataloader:
            mf_img = mf_img.to(self.device)
            output = self.mfr_model.generate({'image': mf_img})
            mfr_res.extend(output['pred_str'])
        # Predictions come back in dataset order, matching latex_filling_list.
        for res, latex in zip(latex_filling_list, mfr_res):
            res['latex'] = latex_rm_whitespace(latex)
        mfr_cost = round(time.time() - mfr_start, 2)
        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")

        # OCR recognition.
        if self.apply_ocr:
            ocr_start = time.time()
            pil_img = Image.fromarray(image)
            # Collect formula boxes (13/14) so the OCR model can mask them out.
            single_page_mfdetrec_res = []
            for res in layout_res:
                if int(res['category_id']) in [13, 14]:
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    single_page_mfdetrec_res.append({
                        "bbox": [xmin, ymin, xmax, ymax],
                    })
            # NOTE(review): this loop appends to layout_res while iterating it;
            # it terminates only because appended items (category 15) never
            # match the filter — consider iterating a snapshot instead.
            for res in layout_res:
                if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # categories that need OCR
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    crop_box = (xmin, ymin, xmax, ymax)
                    # Paste the crop onto a white page-sized canvas so the OCR
                    # box coordinates stay in page space.
                    cropped_img = Image.new('RGB', pil_img.size, 'white')
                    cropped_img.paste(pil_img.crop(crop_box), crop_box)
                    cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
                    ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
                    if ocr_res:
                        for box_ocr_res in ocr_res:
                            p1, p2, p3, p4 = box_ocr_res[0]
                            text, score = box_ocr_res[1]
                            layout_res.append({
                                'category_id': 15,
                                'poly': p1 + p2 + p3 + p4,
                                'score': round(score, 2),
                                'text': text,
                            })
            ocr_cost = round(time.time() - ocr_start, 2)
            logger.info(f"ocr cost: {ocr_cost}")

        return layout_res
a/magic_pdf/model/pek_sub_modules/__init__.py b/magic_pdf/model/pek_sub_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..5364f862e78205c65ffe3fdeba6aef09da148c39 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py @@ -0,0 +1,179 @@ +# -------------------------------------------------------------------------------- +# VIT: Multi-Path Vision Transformer for Dense Prediction +# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI). +# All Rights Reserved. +# Written by Youngwan Lee +# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# CoaT: https://github.com/mlpc-ucsd/CoaT +# -------------------------------------------------------------------------------- + + +import torch + +from detectron2.layers import ( + ShapeSpec, +) +from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN +from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool + +from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16 +from .deit import deit_base_patch16, mae_base_patch16 +from .layoutlmft.models.layoutlmv3 import LayoutLMv3Model +from transformers import AutoConfig + +__all__ = [ + "build_vit_fpn_backbone", +] + + +class VIT_Backbone(Backbone): + """ + Implement VIT backbone. + """ + + def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs, + config_path=None, image_only=False, cfg=None): + super().__init__() + self._out_features = out_features + if 'base' in name: + self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32} + self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768} + else: + self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32} + self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024} + + if name == 'beit_base_patch16': + model_func = beit_base_patch16 + elif name == 'dit_base_patch16': + model_func = dit_base_patch16 + elif name == "deit_base_patch16": + model_func = deit_base_patch16 + elif name == "mae_base_patch16": + model_func = mae_base_patch16 + elif name == "dit_large_patch16": + model_func = dit_large_patch16 + elif name == "beit_large_patch16": + model_func = beit_large_patch16 + + if 'beit' in name or 'dit' in name: + if pos_type == "abs": + self.backbone = model_func(img_size=img_size, + 
    def forward(self, x):
        """
        Args:
            x: for layoutlmv3 backbones, a dict with optional keys
               "input_ids", "bbox", "images", "attention_mask";
               otherwise a Tensor of shape (N, C, H, W). H, W must be a
               multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        if "layoutlmv3" in self.name:
            return self.backbone.forward(
                input_ids=x["input_ids"] if "input_ids" in x else None,
                bbox=x["bbox"] if "bbox" in x else None,
                images=x["images"] if "images" in x else None,
                attention_mask=x["attention_mask"] if "attention_mask" in x else None,
                # output_hidden_states=True,
            )
        assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        return self.backbone.forward_features(x)

    def output_shape(self):
        # Channel count and stride per requested output feature, taken from
        # the tables populated in __init__.
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }


def build_VIT_backbone(cfg):
    """
    Create a VIT instance from config.

    Args:
        cfg: a detectron2 CfgNode

    Returns:
        A VIT backbone instance.
    """
    # fmt: off
    name = cfg.MODEL.VIT.NAME
    out_features = cfg.MODEL.VIT.OUT_FEATURES
    drop_path = cfg.MODEL.VIT.DROP_PATH
    img_size = cfg.MODEL.VIT.IMG_SIZE
    pos_type = cfg.MODEL.VIT.POS_TYPE

    # NOTE(review): eval() on a config-supplied string executes arbitrary
    # code; acceptable only because configs are trusted local files — never
    # feed untrusted configs through this path.
    model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", ""))

    if 'layoutlmv3' in name:
        if cfg.MODEL.CONFIG_PATH != '':
            config_path = cfg.MODEL.CONFIG_PATH
        else:
            # Derive the HF config directory from the weights path.
            config_path = cfg.MODEL.WEIGHTS.replace('pytorch_model.bin', '')  # layoutlmv3 pre-trained models
            config_path = config_path.replace('model_final.pth', '')  # detection fine-tuned models
    else:
        config_path = None

    return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs,
                        config_path=config_path, image_only=cfg.MODEL.IMAGE_ONLY, cfg=cfg)
+ """ + bottom_up = build_VIT_backbone(cfg) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..03d4fabdc7816f19a8810e3c443643bc9e53e6b9 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py @@ -0,0 +1,671 @@ +""" Vision Transformer (ViT) in PyTorch + +A PyTorch implement of Vision Transformers as described in +'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929 + +The official jax code is released and available at https://github.com/google-research/vision_transformer + +Status/TODO: +* Models updated to be compatible with official impl. Args added to support backward compat for old PyTorch weights. +* Weights ported from official jax impl for 384x384 base and small models, 16x16 and 32x32 patches. +* Trained (supervised on ImageNet-1k) my custom 'small' patch model to 77.9, 'base' to 79.4 top-1 with this code. +* Hopefully find time and GPUs for SSL or unsupervised pretraining on OpenImages w/ ImageNet fine-tune in future. + +Acknowledgments: +* The paper authors for releasing code and weights, thanks! +* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... 
check it out
for some einops/einsum fun
* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
* Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2020 Ross Wightman
"""
import warnings
import math
import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_


def _cfg(url='', **kwargs):
    # Default timm-style pretrained-model config dict; **kwargs overrides any entry.
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Delegates to timm's functional drop_path; no-op in eval mode.
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> Linear -> Dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        # x = self.drop(x)
        # commit this for the original BERT implement
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    """Multi-head self-attention with optional decomposed q/v biases and an
    optional learned relative-position bias table (BEiT style)."""

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., window_size=None, attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        # qkv is created without a bias; when qkv_bias is requested, separate q/v bias
        # parameters are concatenated with a frozen zero k-bias at forward time.
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            # +3 extra entries for cls-to-cls, token-to-cls and cls-to-token biases.
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(window_size[0])
            coords_w = torch.arange(window_size[1])
            # NOTE(review): torch.meshgrid without indexing= warns on newer torch; verify torch version pin.
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", relative_position_index)

            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size =
None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        """Self-attention over x of shape (B, N, C); optionally adds a shared
        rel_pos_bias and/or this layer's own relative-position bias, interpolating
        the bias table when the runtime window differs from the pre-training one."""
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            # k gets a frozen zero bias so only q and v biases are learned.
            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        if self.relative_position_bias_table is not None:
            # NOTE(review): training_window_size is a tensor and self.window_size a tuple;
            # `==` here is elementwise — confirm this branch behaves as intended on the torch version used.
            if training_window_size == self.window_size:
                relative_position_bias = \
                    self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                        self.window_size[0] * self.window_size[1] + 1,
                        self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)
            else:
                training_window_size = tuple(training_window_size.tolist())
                new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
                # new_num_relative_distance covers every possible relative-position entry,
                # including cls-to-cls, token-to-cls and cls-to-token.
                # Bicubically resize the spatial part of the bias table to the new window;
                # the last 3 special (cls) entries are kept as-is and re-appended below.
                new_relative_position_bias_table = F.interpolate(
                    self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                                 2 * self.window_size[0] - 1,
                                                                                 2 * self.window_size[1] - 1),
                    size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                    align_corners=False)
                new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                         new_num_relative_distance - 3).permute(
                    1, 0)
                new_relative_position_bias_table = torch.cat(
                    [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

                # get pair-wise relative position index for each token inside the window
                coords_h = torch.arange(training_window_size[0])
                coords_w = torch.arange(training_window_size[1])
                coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
                coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
                relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
                relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
                relative_coords[:, :, 0] += training_window_size[0] - 1  # shift to start from 0
                relative_coords[:, :, 1] += training_window_size[1] - 1
                relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
                relative_position_index = \
                    torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
                                dtype=relative_coords.dtype)
                relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
                relative_position_index[0, 0:] = new_num_relative_distance - 3
                relative_position_index[0:, 0] = new_num_relative_distance - 2
                relative_position_index[0, 0] = new_num_relative_distance - 1

                relative_position_bias = \
                    new_relative_position_bias_table[relative_position_index.view(-1)].view(
                        training_window_size[0] * training_window_size[1] + 1,
                        training_window_size[0] * training_window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)

        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Standard pre-norm transformer block (attention + MLP), with optional
    LayerScale (gamma_1/gamma_2) and stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
                 attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # LayerScale: per-channel learnable residual scaling, enabled when init_values is given.
        if init_values is not None:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        # Pre-norm residual updates; gamma_1/gamma_2 scale the residual branches when LayerScale is on.
        if self.gamma_1 is None:
            x = x + self.drop_path(
                self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, training_window_size=training_window_size))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias,
                                                            training_window_size=training_window_size))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Projects the image into non-overlapping patch tokens with a strided Conv2d and
    optionally adds a position embedding interpolated to the actual patch grid.
    """

    def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches_w = self.patch_shape[0]
        self.num_patches_h = self.patch_shape[1]
        # the so-called patch_shape is the patch shape during pre-training
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x, position_embedding=None, **kwargs):
        """Return (tokens of shape (B, Hp*Wp, C), (Hp, Wp)) for input images x."""
        # FIXME look at relaxing size constraints
        # assert H == self.img_size[0] and W == self.img_size[1], \
        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3,
                                                                                                                  1, 2)
            position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
            x = x + position_embedding

        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=[224, 224], feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                # Probe the backbone with a dummy input to discover its output size/channels,
                # restoring its original train/eval mode afterwards.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class RelativePositionBias(nn.Module):
    """Shared relative-position bias table (BEiT style), one bias per head and
    per relative offset, with 3 extra entries for cls interactions. Can be
    bicubically resized for a window size different from pre-training."""

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_heads = num_heads
        # +3 extra entries for cls-to-cls, token-to-cls and cls-to-token biases.
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = \
            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", relative_position_index)

        # trunc_normal_(self.relative_position_bias_table, std=.02)

    def forward(self, training_window_size):
        """Return the (nH, N, N) bias for the given window size, resizing the
        table on the fly when it differs from the pre-training window."""
        # NOTE(review): tensor-vs-tuple `==` is elementwise here — confirm intended on the torch version used.
        if training_window_size == self.window_size:
            relative_position_bias = \
                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        else:
            training_window_size = tuple(training_window_size.tolist())
            new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
            # new_num_relative_distance covers every possible relative-position entry,
            # including cls-to-cls, token-to-cls and cls-to-token.
            new_relative_position_bias_table = F.interpolate(
                self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                             2 * self.window_size[0] - 1,
                                                                             2 * self.window_size[1] - 1),
                size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                align_corners=False)
            new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                     new_num_relative_distance - 3).permute(
                1, 0)
            new_relative_position_bias_table = torch.cat(
                [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(training_window_size[0])
            coords_w = torch.arange(training_window_size[1])
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += training_window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += training_window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
                            dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = new_num_relative_distance - 3
            relative_position_index[0:, 0] = new_num_relative_distance - 2
            relative_position_index[0, 0] = new_num_relative_distance - 1

            relative_position_bias = \
                new_relative_position_bias_table[relative_position_index.view(-1)].view(
                    training_window_size[0] * training_window_size[1] + 1,
                    training_window_size[0] * training_window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww

        return relative_position_bias


class BEiT(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    Backbone variant: emits a dict of multi-scale feature maps (via fpn1..fpn4)
    named by `out_features`, for use as a detection backbone.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 num_classes=80,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=None,
                 init_values=None,
                 use_abs_pos_emb=False,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 use_checkpoint=True,
                 pretrained=None,
                 out_features=None,
                 ):

        super(BEiT, self).__init__()

        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.use_checkpoint = use_checkpoint

        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=img_size, patch_size=patch_size, in_chans=in_chans,
                embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches
        self.out_features = out_features
        # out_features names look like "layerN"; the numeric suffix selects which blocks to tap.
        self.out_indices = [int(name[5:]) for name in out_features]

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        self.use_shared_rel_pos_bias = use_shared_rel_pos_bias
        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
            for i in range(depth)])

        # trunc_normal_(self.mask_token, std=.02)

        # FPN necks: up/down-sample the single-scale ViT output into a feature pyramid
        # (strides 4x/8x/16x/32x relative to the patch grid, depending on patch_size).
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                # nn.SyncBatchNorm(embed_dim),
                nn.BatchNorm2d(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Identity()

            self.fpn3 = nn.Sequential(
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

            self.fpn4 = nn.Sequential(
                nn.MaxPool2d(kernel_size=4, stride=4),
            )

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

    def fix_init_weight(self):
        # Rescale residual-branch projection weights by 1/sqrt(2*layer_id) for training stability.
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        # Standard truncated-normal init for linears; unit-gain init for LayerNorm.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    '''
    def init_weights(self):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        logger = get_root_logger()

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

        if self.init_cfg is None:
            logger.warn(f'No pre-trained weights for '
                        f'{self.__class__.__name__}, '
                        f'training start from scratch')
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__} '
            logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}")
            load_checkpoint(self,
                            filename=self.init_cfg['checkpoint'],
                            strict=False,
                            logger=logger,
                            beit_spec_expand_rel_pos = self.use_rel_pos_bias,
                            )
    '''

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names excluded from weight decay by the optimizer setup.
        return {'pos_embed', 'cls_token'}

    def forward_features(self, x):
        """Run the transformer and return {feature_name: (B, C, H', W') map} for
        each requested out_feature, after the fpn1..fpn4 rescaling necks."""
        B, C, H, W = x.shape
        x, (Hp, Wp) = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
        # Hp, Wp are HW for patches
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        if self.pos_embed is not None:
            cls_tokens = cls_tokens + self.pos_embed[:, :1, :]
        x = torch.cat((cls_tokens, x), dim=1)
        x = self.pos_drop(x)

        features = []
        # Actual patch grid of this input; may differ from the pre-training window.
        training_window_size = torch.tensor([Hp, Wp])

        rel_pos_bias = self.rel_pos_bias(training_window_size) if self.rel_pos_bias is not None else None

        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                # Gradient checkpointing trades compute for activation memory.
                x = checkpoint.checkpoint(blk, x, rel_pos_bias, training_window_size)
            else:
                x = blk(x, rel_pos_bias=rel_pos_bias, training_window_size=training_window_size)
            if i in self.out_indices:
                # Drop the cls token and restore the 2-D patch grid.
                xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
                features.append(xp.contiguous())

        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        feat_out = {}

        for name, value in zip(self.out_features, features):
            feat_out[name] = value

        return feat_out

    def forward(self, x):
        x = self.forward_features(x)
        return x


def beit_base_patch16(pretrained=False, **kwargs):
    """BEiT-Base/16 backbone (no LayerScale)."""
    model = BEiT(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def beit_large_patch16(pretrained=False, **kwargs):
    """BEiT-Large/16 backbone (no LayerScale)."""
    model = BEiT(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def dit_base_patch16(pretrained=False, **kwargs):
    """DiT-Base/16 backbone (LayerScale init 0.1)."""
    model = BEiT(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=0.1,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def dit_large_patch16(pretrained=False, **kwargs):
    """DiT-Large/16 backbone (LayerScale init 1e-5)."""
    model =
BEiT(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=1e-5,
        **kwargs)
    model.default_cfg = _cfg()
    return model


if __name__ == '__main__':
    # Smoke test: variable input sizes exercise the relative-position-bias interpolation path.
    model = BEiT(use_checkpoint=True, use_shared_rel_pos_bias=True)
    model = model.to("cuda:0")
    input1 = torch.rand(2, 3, 512, 762).to("cuda:0")
    input2 = torch.rand(2, 3, 800, 1200).to("cuda:0")
    input3 = torch.rand(2, 3, 720, 1000).to("cuda:0")
    output1 = model(input1)
    output2 = model(input2)
    output3 = model(input3)
    print("all done")
diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a13bb0a8514df29fb4b0ec58c3726ba9c221a8a
--- /dev/null
+++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py
@@ -0,0 +1,476 @@
"""
Mostly copy-paste from DINO and timm library:
https://github.com/facebookresearch/dino
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
import warnings

import math
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, drop_path, to_2tuple
from functools import partial


def _cfg(url='', **kwargs):
    # Default timm-style pretrained-model config dict; **kwargs overrides any entry.
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Delegates to timm's functional drop_path; no-op in eval mode.
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> Dropout -> Linear -> Dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    """Plain multi-head self-attention (timm/DeiT style, no relative position bias)."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        # Single projection produces q, k, v stacked along dim 0 after the permute.
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                      C // self.num_heads).permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Standard pre-norm transformer block (attention + MLP) with stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)

    def forward(self, x):
        # Pre-norm residual updates for attention and MLP sub-blocks.
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Strided Conv2d patchifier; unlike the beit.py variant this forward returns
    the raw (B, C, Hp, Wp) feature map without flattening.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.window_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])

        self.num_patches_w, self.num_patches_h = self.window_size

        self.num_patches = self.window_size[0] * self.window_size[1]
        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)
        return x


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                # Probe the backbone with a dummy input, restoring its train/eval mode afterwards.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(
                    1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class ViT(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    DeiT/DINO-style backbone: absolute position embeddings (interpolated to the
    input size), optional distillation token, multi-scale outputs via fpn1..fpn4.
    """

    def __init__(self,
                 model_name='vit_base_patch16_224',
                 img_size=384,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=1024,
                 depth=24,
                 num_heads=16,
                 num_classes=19,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.1,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_cfg=None,
                 pos_embed_interp=False,
                 random_init=False,
                 align_corners=False,
                 use_checkpoint=False,
                 num_extra_tokens=1,
                 out_features=None,
                 **kwargs,
                 ):

        super(ViT, self).__init__()
        self.model_name = model_name
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.num_classes = num_classes
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.qk_scale = qk_scale
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.drop_path_rate = drop_path_rate
        self.hybrid_backbone = hybrid_backbone
        self.norm_layer = norm_layer
        self.norm_cfg = norm_cfg
        self.pos_embed_interp = pos_embed_interp
        self.random_init = random_init
        self.align_corners = align_corners
        self.use_checkpoint = use_checkpoint
        # 1 = cls token only; 2 = cls + distillation token (DeiT).
        self.num_extra_tokens = num_extra_tokens
        self.out_features = out_features
        # out_features names look like "layerN"; the numeric suffix selects which blocks to tap.
        self.out_indices = [int(name[5:])
for name in out_features] + + # self.num_stages = self.depth + # self.out_indices = tuple(range(self.num_stages)) + + if self.hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + self.hybrid_backbone, img_size=self.img_size, in_chans=self.in_chans, embed_dim=self.embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=self.img_size, patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim) + self.num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + + if self.num_extra_tokens == 2: + self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + + self.pos_embed = nn.Parameter(torch.zeros( + 1, self.num_patches + self.num_extra_tokens, self.embed_dim)) + self.pos_drop = nn.Dropout(p=self.drop_rate) + + # self.num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, + self.depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=self.embed_dim, num_heads=self.num_heads, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop=self.drop_rate, attn_drop=self.attn_drop_rate, drop_path=dpr[i], norm_layer=self.norm_layer) + for i in range(self.depth)]) + + # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here + # self.repr = nn.Linear(embed_dim, representation_size) + # self.repr_act = nn.Tanh() + + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn3 = nn.Identity() + + self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, 
embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Identity() + + self.fpn3 = nn.Sequential( + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + self.fpn4 = nn.Sequential( + nn.MaxPool2d(kernel_size=4, stride=4), + ) + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + if self.num_extra_tokens==2: + trunc_normal_(self.dist_token, std=0.2) + self.apply(self._init_weights) + # self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + ''' + def init_weights(self): + logger = get_root_logger() + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}") + load_checkpoint(self, filename=self.init_cfg['checkpoint'], strict=False, logger=logger) + ''' + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def _conv_filter(self, state_dict, patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = 
    def interpolate_pos_encoding(self, x, w, h):
        """Return position embeddings resized to the current token grid.

        Args:
            x: token sequence of shape (B, num_extra_tokens + npatch, dim);
               only its token count and embedding dim are read.
            w, h: input image width and height in pixels.

        Returns:
            Tensor of shape (1, num_extra_tokens + npatch, dim): the stored
            ``pos_embed`` with its patch part bicubically interpolated to the
            (w // patch_w) x (h // patch_h) grid; the cls/dist embeddings are
            passed through unchanged.

        NOTE(review): the reshape below assumes the pretrained patch grid is
        square (N is a perfect square) — confirm for non-square checkpoints.
        """
        npatch = x.shape[1] - self.num_extra_tokens
        N = self.pos_embed.shape[1] - self.num_extra_tokens
        # Fast path: token count already matches and the image is square.
        if npatch == N and w == h:
            return self.pos_embed

        # Split off the cls (and optional distillation) token embeddings.
        class_ORdist_pos_embed = self.pos_embed[:, 0:self.num_extra_tokens]

        patch_pos_embed = self.pos_embed[:, self.num_extra_tokens:]

        dim = x.shape[-1]
        # Target patch grid implied by the input resolution.
        w0 = w // self.patch_embed.patch_size[0]
        h0 = h // self.patch_embed.patch_size[1]
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        # Interpolation must land exactly on the integer target grid.
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        # Back to (1, npatch, dim) token layout.
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_ORdist_pos_embed, patch_pos_embed), dim=1)
    def forward_features(self, x):
        """Run the transformer and return multi-scale FPN feature maps.

        Args:
            x: image batch of shape (B, C, H, W); H and W must be divisible
               by ``self.patch_size``.

        Returns:
            Dict mapping each name in ``self.out_features`` to a feature map
            of shape (B, embed_dim, Hp*, Wp*): the blocks selected by
            ``self.out_indices`` are reshaped back onto the patch grid and
            rescaled by fpn1..fpn4.
        """
        # print(f"==========shape of x is {x.shape}==========")
        B, _, H, W = x.shape
        # Patch-grid resolution used to fold token sequences back to 2-D maps.
        Hp, Wp = H // self.patch_size, W // self.patch_size
        x = self.prepare_tokens(x)

        features = []
        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                # Trade compute for memory: recompute activations on backward.
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
            if i in self.out_indices:
                # Drop cls/dist tokens, then (B, L, C) -> (B, C, Hp, Wp).
                xp = x[:, self.num_extra_tokens:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
                features.append(xp.contiguous())

        # One FPN head per collected map, in out_indices order.
        # NOTE(review): assumes out_indices selects at most four blocks — confirm.
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        feat_out = {}

        for name, value in zip(self.out_features, features):
            feat_out[name] = value

        return feat_out
def quad_to_box(quad):
    """Convert a CORD quadrilateral annotation to an (x0, y0, x1, y1) tuple.

    Uses the top-left (x1, y1) and bottom-right (x3, y3) corners of the quad,
    clamping the top-left corner to be non-negative.  Coordinates arriving in
    reversed order are swapped (sample "test 87" is wrongly annotated).
    """
    x0, y0 = max(0, quad["x1"]), max(0, quad["y1"])
    x1, y1 = quad["x3"], quad["y3"]
    if y1 < y0:
        y0, y1 = y1, y0
    if x1 < x0:
        x0, x1 = x1, x0
    return (x0, y0, x1, y1)
class CordConfig(datasets.BuilderConfig):
    """BuilderConfig for the CORD receipt-understanding dataset."""

    def __init__(self, **kwargs):
        """BuilderConfig for CORD.

        Args:
            **kwargs: keyword arguments forwarded to ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)
names=["O","B-MENU.NM","B-MENU.NUM","B-MENU.UNITPRICE","B-MENU.CNT","B-MENU.DISCOUNTPRICE","B-MENU.PRICE","B-MENU.ITEMSUBTOTAL","B-MENU.VATYN","B-MENU.ETC","B-MENU.SUB_NM","B-MENU.SUB_UNITPRICE","B-MENU.SUB_CNT","B-MENU.SUB_PRICE","B-MENU.SUB_ETC","B-VOID_MENU.NM","B-VOID_MENU.PRICE","B-SUB_TOTAL.SUBTOTAL_PRICE","B-SUB_TOTAL.DISCOUNT_PRICE","B-SUB_TOTAL.SERVICE_PRICE","B-SUB_TOTAL.OTHERSVC_PRICE","B-SUB_TOTAL.TAX_PRICE","B-SUB_TOTAL.ETC","B-TOTAL.TOTAL_PRICE","B-TOTAL.TOTAL_ETC","B-TOTAL.CASHPRICE","B-TOTAL.CHANGEPRICE","B-TOTAL.CREDITCARDPRICE","B-TOTAL.EMONEYPRICE","B-TOTAL.MENUTYPE_CNT","B-TOTAL.MENUQTY_CNT","I-MENU.NM","I-MENU.NUM","I-MENU.UNITPRICE","I-MENU.CNT","I-MENU.DISCOUNTPRICE","I-MENU.PRICE","I-MENU.ITEMSUBTOTAL","I-MENU.VATYN","I-MENU.ETC","I-MENU.SUB_NM","I-MENU.SUB_UNITPRICE","I-MENU.SUB_CNT","I-MENU.SUB_PRICE","I-MENU.SUB_ETC","I-VOID_MENU.NM","I-VOID_MENU.PRICE","I-SUB_TOTAL.SUBTOTAL_PRICE","I-SUB_TOTAL.DISCOUNT_PRICE","I-SUB_TOTAL.SERVICE_PRICE","I-SUB_TOTAL.OTHERSVC_PRICE","I-SUB_TOTAL.TAX_PRICE","I-SUB_TOTAL.ETC","I-TOTAL.TOTAL_PRICE","I-TOTAL.TOTAL_ETC","I-TOTAL.CASHPRICE","I-TOTAL.CHANGEPRICE","I-TOTAL.CREDITCARDPRICE","I-TOTAL.EMONEYPRICE","I-TOTAL.MENUTYPE_CNT","I-TOTAL.MENUQTY_CNT"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + citation=_CITATION, + homepage="https://github.com/clovaai/cord/", + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + """Uses local files located with data_dir""" + downloaded_file = dl_manager.download_and_extract(_URLS) + # move files from the second URL together with files from the first one. 
+ dest = Path(downloaded_file[0])/"CORD" + for split in ["train", "dev", "test"]: + for file_type in ["image", "json"]: + if split == "test" and file_type == "json": + continue + files = (Path(downloaded_file[1])/"CORD"/split/file_type).iterdir() + for f in files: + os.rename(f, dest/split/file_type/f.name) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest/"train"} + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"filepath": dest/"dev"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": dest/"test"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] + + x0, y0, x1, y1 = min(x), min(y), max(x), max(y) + + assert x1 >= x0 and y1 >= y0 + bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] + return bbox + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "json") + img_dir = os.path.join(filepath, "image") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + words = [] + bboxes = [] + ner_tags = [] + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["valid_line"]: + cur_line_bboxes = [] + line_words, label = item["words"], item["category"] + line_words = [w for w in line_words if w["text"].strip() != ""] + if len(line_words) == 0: + continue + if label == "other": + for w in line_words: + words.append(w["text"]) + ner_tags.append("O") + cur_line_bboxes.append(normalize_bbox(quad_to_box(w["quad"]), size)) + else: + words.append(line_words[0]["text"]) + ner_tags.append("B-" + label.upper()) + 
def pre_calc_rel_mat(segment_ids):
    """Build a per-batch boolean "same segment" relation matrix.

    Args:
        segment_ids: integer tensor of shape (B, L) assigning each token to a
            segment id.

    Returns:
        BoolTensor ``valid_span`` of shape (B, L, L) where
        ``valid_span[b, i, j]`` is True iff tokens *i* and *j* of batch
        element *b* carry the same segment id.
    """
    # Vectorized equivalent of the original per-row Python loop that filled
    # valid_span[b, i, :] = (segment_ids[b, :] == segment_ids[b, i]); one
    # broadcasted comparison replaces B*L tensor ops and keeps device/dtype.
    return segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)
+ Args: + tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + The tokenizer used for encoding the data. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (:obj:`int`, `optional`, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). 
    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Pads token-level fields with the tokenizer, pads the ragged
        ``labels``/``bbox``/``position_ids`` lists by hand, stacks images and
        widens the attention mask / labels with visual-token slots, and turns
        ``segment_ids`` into a ``valid_span`` relation matrix.
        """
        # HF datasets may store targets under either "label" or "labels".
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        images = None
        if "images" in features[0]:
            images = torch.stack([torch.tensor(d.pop("images")) for d in features])
            # Visual token count: one per 16x16 patch plus one extra slot.
            # NOTE(review): assumes square images and 16-px patches — confirm.
            IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
            return_tensors="pt" if labels is None else None,
        )

        if images is not None:
            batch["images"] = images
            # attention_mask may still be a list of lists here; tensorize only it.
            batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v
                     for k, v in batch.items()}
            # Visual tokens are always attended to.
            visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long)
            batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1)

        if labels is None:
            return batch

        has_bbox_input = "bbox" in features[0]
        has_position_input = "position_ids" in features[0]
        padding_idx=self.tokenizer.pad_token_id
        sequence_length = torch.tensor(batch["input_ids"]).shape[1]
        padding_side = self.tokenizer.padding_side
        # Pad the ragged per-token fields out to sequence_length on the same
        # side the tokenizer pads on.
        if padding_side == "right":
            batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
            if has_bbox_input:
                batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
            if has_position_input:
                batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id))
                                         for position_id in batch["position_ids"]]

        else:
            batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
            if has_bbox_input:
                batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
            if has_position_input:
                batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id))
                                         + position_id for position_id in batch["position_ids"]]

        if 'segment_ids' in batch:
            assert 'position_ids' in batch
            # Extend segment ids: text padding gets a fresh segment id and the
            # visual tokens yet another, so neither matches real text segments.
            # NOTE(review): IMAGE_LEN is only bound when "images" was present;
            # segment_ids without images would raise NameError here — confirm
            # callers always supply images alongside segment_ids.
            for i in range(len(batch['segment_ids'])):
                batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [
                    batch['segment_ids'][i][-1] + 2] * IMAGE_LEN

        batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}

        if 'segment_ids' in batch:
            valid_span = pre_calc_rel_mat(
                segment_ids=batch['segment_ids']
            )
            batch['valid_span'] = valid_span
            del batch['segment_ids']

        if images is not None:
            # Visual positions carry no supervision: -100 is ignored by the loss.
            visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100
            batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1)

        return batch
Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, 
    def _generate_examples(self, filepath):
        """Yield ``(guid, example)`` pairs from one FUNSD split directory.

        Reads every JSON annotation under ``<filepath>/annotations``, pairs
        it with the PNG of the same stem under ``<filepath>/images``, and
        emits word tokens, BIO ``ner_tags`` and line-union bounding boxes
        normalized to the 0-1000 grid.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotations")
        img_dir = os.path.join(filepath, "images")
        for guid, file in enumerate(sorted(os.listdir(ann_dir))):
            tokens = []
            bboxes = []
            ner_tags = []

            file_path = os.path.join(ann_dir, file)
            with open(file_path, "r", encoding="utf8") as f:
                data = json.load(f)
            image_path = os.path.join(img_dir, file)
            # NOTE(review): replaces every "json" substring in the path, not
            # just the extension — a stem containing "json" would break this.
            image_path = image_path.replace("json", "png")
            image, size = load_image(image_path)
            for item in data["form"]:
                cur_line_bboxes = []
                words, label = item["words"], item["label"]
                # Drop words that are empty after stripping whitespace.
                words = [w for w in words if w["text"].strip() != ""]
                if len(words) == 0:
                    continue
                if label == "other":
                    # "other" spans carry no entity: tag every word O.
                    for w in words:
                        tokens.append(w["text"])
                        ner_tags.append("O")
                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
                else:
                    # BIO scheme: first word opens the entity, the rest continue it.
                    tokens.append(words[0]["text"])
                    ner_tags.append("B-" + label.upper())
                    cur_line_bboxes.append(normalize_bbox(words[0]["box"], size))
                    for w in words[1:]:
                        tokens.append(w["text"])
                        ner_tags.append("I-" + label.upper())
                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
                # by default: --segment_level_layout 1
                # if do not want to use segment_level_layout, comment the following line
                cur_line_bboxes = self.get_line_bbox(cur_line_bboxes)
                # box = normalize_bbox(item["box"], size)
                # cur_line_bboxes = [box for _ in range(len(words))]
                bboxes.extend(cur_line_bboxes)
            yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
                         "image": image, "image_path": image_path}
def normalize_bbox(bbox, size):
    """Rescale an (x0, y0, x1, y1) pixel box onto the 0-1000 coordinate grid.

    ``size`` is the page (width, height); x coordinates are divided by the
    width and y coordinates by the height, then truncated to int.
    """
    width, height = size
    return [int(1000 * coord / extent)
            for coord, extent in zip(bbox, (width, height, width, height))]
def clamp(num, min_value, max_value):
    """Clamp *num* into the closed interval [min_value, max_value].

    When the bounds are inverted the lower bound wins, matching
    ``max(min(num, max_value), min_value)``.
    """
    if num > max_value:
        num = max_value
    return min_value if num < min_value else num
class Compose:
    """Chain several transforms together; not torchscript-compatible.

    Each transform must accept ``(img, augmentation, box)`` and return the
    (possibly replaced) image, which is threaded through the chain.

    Args:
        transforms (list of ``Transform`` objects): transforms to compose.

    Example:
        >>> Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.PILToTensor(),
        >>> ])

    .. note::
        To script transformations, wrap scriptable (tensor-only) transforms
        in ``torch.nn.Sequential`` instead; ``lambda`` functions and
        ``PIL.Image`` inputs cannot be scripted.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, augmentation=False, box=None):
        result = img
        for transform in self.transforms:
            result = transform(result, augmentation, box)
        return result
/ 3.), + interpolation='bilinear', second_interpolation='lanczos'): + if isinstance(size, tuple): + self.size = size + else: + self.size = (size, size) + if second_size is not None: + if isinstance(second_size, tuple): + self.second_size = second_size + else: + self.second_size = (second_size, second_size) + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + self.interpolation = _pil_interp(interpolation) + self.second_interpolation = _pil_interp(second_interpolation) + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img, augmentation=False, box=None): + """ + Args: + img (PIL Image): Image to be cropped and resized. 
+ Returns: + PIL Image: Randomly cropped and resized image. + """ + if augmentation: + i, j, h, w = self.get_params(img, self.scale, self.ratio) + img = F.crop(img, i, j, h, w) + # img, box = crop(img, i, j, h, w, box) + img = F.resize(img, self.size, self.interpolation) + second_img = F.resize(img, self.second_size, self.second_interpolation) \ + if self.second_size is not None else None + return img, second_img + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0}'.format(interpolate_str) + if self.second_size is not None: + format_string += ', second_size={0}'.format(self.second_size) + format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) + format_string += ')' + return format_string + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py new file mode 100644 index 0000000000000000000000000000000000000000..7749ba5dd1d59a4e0c5baf4f2c27cffaae3e4e12 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py @@ -0,0 +1,213 @@ +import os +import json + +import torch +from torch.utils.data.dataset import Dataset +from torchvision import transforms +from PIL import Image + +from .image_utils import Compose, 
RandomResizedCropAndInterpolationWithTwoPic + +XFund_label2ids = { + "O":0, + 'B-HEADER':1, + 'I-HEADER':2, + 'B-QUESTION':3, + 'I-QUESTION':4, + 'B-ANSWER':5, + 'I-ANSWER':6, +} + +class xfund_dataset(Dataset): + def box_norm(self, box, width, height): + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + + def get_segment_ids(self, bboxs): + segment_ids = [] + for i in range(len(bboxs)): + if i == 0: + segment_ids.append(0) + else: + if bboxs[i - 1] == bboxs[i]: + segment_ids.append(segment_ids[-1]) + else: + segment_ids.append(segment_ids[-1] + 1) + return segment_ids + + def get_position_ids(self, segment_ids): + position_ids = [] + for i in range(len(segment_ids)): + if i == 0: + position_ids.append(2) + else: + if segment_ids[i] == segment_ids[i - 1]: + position_ids.append(position_ids[-1] + 1) + else: + position_ids.append(2) + return position_ids + + def load_data( + self, + data_file, + ): + # re-org data format + total_data = {"id": [], "lines": [], "bboxes": [], "ner_tags": [], "image_path": []} + for i in range(len(data_file['documents'])): + width, height = data_file['documents'][i]['img']['width'], data_file['documents'][i]['img'][ + 'height'] + + cur_doc_lines, cur_doc_bboxes, cur_doc_ner_tags, cur_doc_image_path = [], [], [], [] + for j in range(len(data_file['documents'][i]['document'])): + cur_item = data_file['documents'][i]['document'][j] + cur_doc_lines.append(cur_item['text']) + cur_doc_bboxes.append(self.box_norm(cur_item['box'], width=width, height=height)) + cur_doc_ner_tags.append(cur_item['label']) + total_data['id'] += [len(total_data['id'])] + total_data['lines'] += [cur_doc_lines] + total_data['bboxes'] += [cur_doc_bboxes] + 
total_data['ner_tags'] += [cur_doc_ner_tags] + total_data['image_path'] += [data_file['documents'][i]['img']['fname']] + + # tokenize text and get bbox/label + total_input_ids, total_bboxs, total_label_ids = [], [], [] + for i in range(len(total_data['lines'])): + cur_doc_input_ids, cur_doc_bboxs, cur_doc_labels = [], [], [] + for j in range(len(total_data['lines'][i])): + cur_input_ids = self.tokenizer(total_data['lines'][i][j], truncation=False, add_special_tokens=False, return_attention_mask=False)['input_ids'] + if len(cur_input_ids) == 0: continue + + cur_label = total_data['ner_tags'][i][j].upper() + if cur_label == 'OTHER': + cur_labels = ["O"] * len(cur_input_ids) + for k in range(len(cur_labels)): + cur_labels[k] = self.label2ids[cur_labels[k]] + else: + cur_labels = [cur_label] * len(cur_input_ids) + cur_labels[0] = self.label2ids['B-' + cur_labels[0]] + for k in range(1, len(cur_labels)): + cur_labels[k] = self.label2ids['I-' + cur_labels[k]] + assert len(cur_input_ids) == len([total_data['bboxes'][i][j]] * len(cur_input_ids)) == len(cur_labels) + cur_doc_input_ids += cur_input_ids + cur_doc_bboxs += [total_data['bboxes'][i][j]] * len(cur_input_ids) + cur_doc_labels += cur_labels + assert len(cur_doc_input_ids) == len(cur_doc_bboxs) == len(cur_doc_labels) + assert len(cur_doc_input_ids) > 0 + + total_input_ids.append(cur_doc_input_ids) + total_bboxs.append(cur_doc_bboxs) + total_label_ids.append(cur_doc_labels) + assert len(total_input_ids) == len(total_bboxs) == len(total_label_ids) + + # split text to several slices because of over-length + input_ids, bboxs, labels = [], [], [] + segment_ids, position_ids = [], [] + image_path = [] + for i in range(len(total_input_ids)): + start = 0 + cur_iter = 0 + while start < len(total_input_ids[i]): + end = min(start + 510, len(total_input_ids[i])) + + input_ids.append([self.tokenizer.cls_token_id] + total_input_ids[i][start: end] + [self.tokenizer.sep_token_id]) + bboxs.append([[0, 0, 0, 0]] + 
total_bboxs[i][start: end] + [[1000, 1000, 1000, 1000]]) + labels.append([-100] + total_label_ids[i][start: end] + [-100]) + + cur_segment_ids = self.get_segment_ids(bboxs[-1]) + cur_position_ids = self.get_position_ids(cur_segment_ids) + segment_ids.append(cur_segment_ids) + position_ids.append(cur_position_ids) + image_path.append(os.path.join(self.args.data_dir, "images", total_data['image_path'][i])) + + start = end + cur_iter += 1 + + assert len(input_ids) == len(bboxs) == len(labels) == len(segment_ids) == len(position_ids) + assert len(segment_ids) == len(image_path) + + res = { + 'input_ids': input_ids, + 'bbox': bboxs, + 'labels': labels, + 'segment_ids': segment_ids, + 'position_ids': position_ids, + 'image_path': image_path, + } + return res + + def __init__( + self, + args, + tokenizer, + mode + ): + self.args = args + self.mode = mode + self.cur_la = args.language + self.tokenizer = tokenizer + self.label2ids = XFund_label2ids + + + self.common_transform = Compose([ + RandomResizedCropAndInterpolationWithTwoPic( + size=args.input_size, interpolation=args.train_interpolation, + ), + ]) + + self.patch_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor((0.5, 0.5, 0.5)), + std=torch.tensor((0.5, 0.5, 0.5))) + ]) + + data_file = json.load( + open(os.path.join(args.data_dir, "{}.{}.json".format(self.cur_la, 'train' if mode == 'train' else 'val')), + 'r')) + + self.feature = self.load_data(data_file) + + def __len__(self): + return len(self.feature['input_ids']) + + def __getitem__(self, index): + input_ids = self.feature["input_ids"][index] + + # attention_mask = self.feature["attention_mask"][index] + attention_mask = [1] * len(input_ids) + labels = self.feature["labels"][index] + bbox = self.feature["bbox"][index] + segment_ids = self.feature['segment_ids'][index] + position_ids = self.feature['position_ids'][index] + + img = pil_loader(self.feature['image_path'][index]) + for_patches, _ = 
self.common_transform(img, augmentation=False) + patch = self.patch_transform(for_patches) + + assert len(input_ids) == len(attention_mask) == len(labels) == len(bbox) == len(segment_ids) + + res = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "bbox": bbox, + "segment_ids": segment_ids, + "position_ids": position_ids, + "images": patch, + } + return res + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') \ No newline at end of file diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b3100effb34547bbaba7503288db34374cad9ca --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py @@ -0,0 +1,7 @@ +from .layoutlmv3 import ( + LayoutLMv3Config, + LayoutLMv3ForTokenClassification, + LayoutLMv3ForQuestionAnswering, + LayoutLMv3ForSequenceClassification, + LayoutLMv3Tokenizer, +) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e06a24b0ca9971cfe99dc9ef60ce8e495ff406bd --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py @@ -0,0 +1,24 @@ +from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, \ + AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer +from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter + +from .configuration_layoutlmv3 import LayoutLMv3Config +from .modeling_layoutlmv3 import ( + 
LayoutLMv3ForTokenClassification, + LayoutLMv3ForQuestionAnswering, + LayoutLMv3ForSequenceClassification, + LayoutLMv3Model, +) +from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer +from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast + + +#AutoConfig.register("layoutlmv3", LayoutLMv3Config) +#AutoModel.register(LayoutLMv3Config, LayoutLMv3Model) +#AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification) +#AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering) +#AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification) +#AutoTokenizer.register( +# LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast +#) +SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter}) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c7b4d71b4d51504dee8bc10e50ea91bac00270 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py @@ -0,0 +1,60 @@ +# coding=utf-8 +from transformers.models.bert.configuration_bert import BertConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json", + "layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/resolve/main/config.json", + # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3 +} + + +class LayoutLMv3Config(BertConfig): + model_type = "layoutlmv3" + + def __init__( + self, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + 
max_2d_position_embeddings=1024, + coordinate_size=None, + shape_size=None, + has_relative_attention_bias=False, + rel_pos_bins=32, + max_rel_pos=128, + has_spatial_attention_bias=False, + rel_2d_pos_bins=64, + max_rel_2d_pos=256, + visual_embed=True, + mim=False, + wpa_task=False, + discrete_vae_weight_path='', + discrete_vae_type='dall-e', + input_size=224, + second_input_size=112, + device='cuda', + **kwargs + ): + """Constructs RobertaConfig.""" + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.max_2d_position_embeddings = max_2d_position_embeddings + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.has_relative_attention_bias = has_relative_attention_bias + self.rel_pos_bins = rel_pos_bins + self.max_rel_pos = max_rel_pos + self.has_spatial_attention_bias = has_spatial_attention_bias + self.rel_2d_pos_bins = rel_2d_pos_bins + self.max_rel_2d_pos = max_rel_2d_pos + self.visual_embed = visual_embed + self.mim = mim + self.wpa_task = wpa_task + self.discrete_vae_weight_path = discrete_vae_weight_path + self.discrete_vae_type = discrete_vae_type + self.input_size = input_size + self.second_input_size = second_input_size + self.device = device diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..113eb8eb1d123a4985c1894e0caab561b19f64c2 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py @@ -0,0 +1,1282 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LayoutLMv3 model. """ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers import apply_chunking_to_forward +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.models.roberta.modeling_roberta import ( + RobertaIntermediate, + RobertaLMHead, + RobertaOutput, + RobertaSelfOutput, +) +from transformers.utils import logging + +from .configuration_layoutlmv3 import LayoutLMv3Config +from timm.models.layers import to_2tuple + + +logger = logging.get_logger(__name__) + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + # The following variables are used in detection mycheckpointer.py + 
self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.num_patches_w = self.patch_shape[0] + self.num_patches_h = self.patch_shape[1] + + def forward(self, x, position_embedding=None): + x = self.proj(x) + + if position_embedding is not None: + # interpolate the position embedding to the corresponding size + position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2) + Hp, Wp = x.shape[2], x.shape[3] + position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic') + x = x + position_embedding + + x = x.flatten(2).transpose(1, 2) + return x + +class LayoutLMv3Embeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size) + self.w_position_embeddings = 
nn.Embedding(config.max_2d_position_embeddings, config.shape_size) + + def _calc_spatial_position_embeddings(self, bbox): + try: + assert torch.all(0 <= bbox) and torch.all(bbox <= 1023) + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e + + h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023)) + w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023)) + + # below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add) + spatial_position_embeddings = torch.cat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + dim=-1, + ) + return spatial_position_embeddings + + def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + def forward( + self, + input_ids=None, + bbox=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids, self.padding_idx, past_key_values_length).to(input_ids.device) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox) + + embeddings = embeddings + spatial_position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+class LayoutLMv3PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = LayoutLMv3Config
+    base_model_prefix = "layoutlmv3"
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class LayoutLMv3SelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, 
self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+    def transpose_for_scores(self, x):
+        # Reshape (batch, seq, all_head_size) -> (batch, num_heads, seq, head_size)
+        # so attention can be computed per head with batched matmuls.
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def cogview_attn(self, attention_scores, alpha=32):
+        '''
+        https://arxiv.org/pdf/2105.13290.pdf
+        Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax).
+        A replacement of the original nn.Softmax(dim=-1)(attention_scores)
+        Seems the new attention_probs will result in a slower speed and a little bias
+        Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison
+        The smaller atol (e.g., 1e-08), the better.
+        '''
+        # Subtracting the per-row max before softmax (scaled by alpha) keeps the
+        # exponentials in a numerically safe range; softmax output is unchanged
+        # up to floating-point error because softmax is shift-invariant.
+        scaled_attention_scores = attention_scores / alpha
+        max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
+        # max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1)
+        new_attention_scores = (scaled_attention_scores - max_value) * alpha
+        return nn.Softmax(dim=-1)(new_attention_scores)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        rel_pos=None,  # NOTE(review): presumably 1-D relative-position bias added to scores downstream — confirm shape (batch, heads, seq, seq)
+        rel_2d_pos=None,  # NOTE(review): presumably 2-D (spatial) relative-position bias — confirm shape matches rel_pos
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # The attention scores QT K/√d could be significantly larger than input elements, and result in overflow. + # Changing the computational order into QT(K/√d) alleviates the problem. 
(https://arxiv.org/pdf/2105.13290.pdf) + attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2)) + + if self.has_relative_attention_bias and self.has_spatial_attention_bias: + attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size) + elif self.has_relative_attention_bias: + attention_scores += rel_pos / math.sqrt(self.attention_head_size) + + # if self.has_relative_attention_bias: + # attention_scores += rel_pos + # if self.has_spatial_attention_bias: + # attention_scores += rel_2d_pos + + # attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + # attention_probs = nn.Softmax(dim=-1)(attention_scores) # comment the line below and use this line for speedup + attention_probs = self.cogview_attn(attention_scores) # to stablize training + # assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class LayoutLMv3Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = LayoutLMv3SelfAttention(config) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = 
self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class LayoutLMv3Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = LayoutLMv3Attention(config) + assert not config.is_decoder and not config.add_cross_attention, \ + "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder." + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class LayoutLMv3Encoder(nn.Module): + def __init__(self, config, detection=False, out_features=None): + super().__init__() + self.config = config + self.detection = detection + self.layer = 
class LayoutLMv3Encoder(nn.Module):
    """Stack of :class:`LayoutLMv3Layer` blocks.

    Optionally adds learned 1D (sequence) and 2D (layout bbox) relative
    position biases shared across layers, and — in ``detection`` mode — taps
    intermediate hidden states through small FPN heads to produce multi-scale
    feature maps for a detectron2 backbone.
    """

    def __init__(self, config, detection=False, out_features=None):
        super().__init__()
        self.config = config
        self.detection = detection
        self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        self.has_relative_attention_bias = config.has_relative_attention_bias
        self.has_spatial_attention_bias = config.has_spatial_attention_bias

        if self.has_relative_attention_bias:
            # 1D relative positions are bucketed, one-hot encoded, then
            # projected to one scalar bias per attention head.
            self.rel_pos_bins = config.rel_pos_bins
            self.max_rel_pos = config.max_rel_pos
            self.rel_pos_onehot_size = config.rel_pos_bins
            self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False)

        if self.has_spatial_attention_bias:
            # Separate bias tables for x and y relative layout offsets.
            self.max_rel_2d_pos = config.max_rel_2d_pos
            self.rel_2d_pos_bins = config.rel_2d_pos_bins
            self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins
            self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
            self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)

        if self.detection:
            # Detection always checkpoints to fit high-res inputs in memory.
            self.gradient_checkpointing = True
            embed_dim = self.config.hidden_size
            self.out_features = out_features
            # out_features are named like "layerN"; strip the "layer" prefix.
            self.out_indices = [int(name[5:]) for name in out_features]
            # fpn1..fpn4 rescale the ViT feature map to strides 4/8/16/32.
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                # nn.SyncBatchNorm(embed_dim),
                nn.BatchNorm2d(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
            self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]

    def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """Map signed relative positions to bucket indices (T5-style).

        Half the buckets cover exact small offsets; the other half cover
        logarithmically larger offsets up to ``max_distance``.
        """
        ret = 0
        if bidirectional:
            # Use the top half of the bucket range for positive offsets.
            num_buckets //= 2
            ret += (relative_position > 0).long() * num_buckets
            n = torch.abs(relative_position)
        else:
            n = torch.max(-relative_position, torch.zeros_like(relative_position))
        # now n is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = n < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).to(torch.long)
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span):
        """Per-head 1D relative position bias, shape (bsz, heads, seq, seq)."""
        # 196 image patches + 1 CLS token appended at the end of the sequence.
        VISUAL_NUM = 196 + 1

        rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)

        if valid_span is not None:
            # for the text part, if two words are not in the same line,
            # set their distance to the max value (position_ids.shape[-1])
            rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1]
            rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1]

            # image-text, minimum distance
            rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0
            rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0

        rel_pos = self.relative_position_bucket(
            rel_pos_mat,
            num_buckets=self.rel_pos_bins,
            max_distance=self.max_rel_pos,
        )
        rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states)
        rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2)
        rel_pos = rel_pos.contiguous()
        return rel_pos

    def _cal_2d_pos_emb(self, hidden_states, bbox):
        """Per-head 2D relative position bias from bbox left-x and bottom-y."""
        position_coord_x = bbox[:, :, 0]
        position_coord_y = bbox[:, :, 3]
        rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
        rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
        rel_pos_x = self.relative_position_bucket(
            rel_pos_x_2d_mat,
            num_buckets=self.rel_2d_pos_bins,
            max_distance=self.max_rel_2d_pos,
        )
        rel_pos_y = self.relative_position_bucket(
            rel_pos_y_2d_mat,
            num_buckets=self.rel_2d_pos_bins,
            max_distance=self.max_rel_2d_pos,
        )
        rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
        rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
        rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2)
        rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2)
        rel_pos_x = rel_pos_x.contiguous()
        rel_pos_y = rel_pos_y.contiguous()
        rel_2d_pos = rel_pos_x + rel_pos_y
        return rel_2d_pos

    def forward(
        self,
        hidden_states,
        bbox=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        position_ids=None,
        Hp=None,
        Wp=None,
        valid_span=None,
    ):
        """Run all layers.

        In detection mode returns a dict of multi-scale feature maps
        (``Hp``/``Wp`` give the patch-grid height/width); otherwise returns
        the standard encoder output (tuple or model-output object).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None

        # Biases are computed once here and shared by every layer.
        rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None
        rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None

        if self.detection:
            feat_out = {}
            j = 0

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)
                        # return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos)
                        # The above line will cause error:
                        # RuntimeError: Trying to backward through the graph a second time
                        # (or directly access saved tensors after they have already been freed).
                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    rel_pos,
                    rel_2d_pos
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    rel_pos=rel_pos,
                    rel_2d_pos=rel_2d_pos,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

            if self.detection and i in self.out_indices:
                # Take the trailing Hp*Wp visual tokens, fold them back into a
                # (B, C, Hp, Wp) map, and rescale with the matching FPN head.
                xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)
                feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
                j += 1

        if self.detection:
            return feat_out

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
    """LayoutLMv3 backbone combining text embeddings (with layout bboxes) and
    ViT-style image patch embeddings into one sequence for the encoder.

    Supports three input configurations: text-only, text+image, and
    image-only (``detection`` mode for the detectron2 backbone).
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
    def __init__(self, config, detection=False, out_features=None, image_only=False):
        super().__init__(config)
        self.config = config
        assert not config.is_decoder and not config.add_cross_attention, \
            "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
        self.detection = detection
        if not self.detection:
            self.image_only = False
        else:
            # image_only is only meaningful when visual embeddings exist.
            assert config.visual_embed
            self.image_only = image_only

        if not self.image_only:
            self.embeddings = LayoutLMv3Embeddings(config)
        self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features)

        if config.visual_embed:
            embed_dim = self.config.hidden_size
            # use the default pre-training parameters for fine-tuning (e.g., input_size)
            # when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward
            self.patch_embed = PatchEmbed(embed_dim=embed_dim)

            patch_size = 16
            size = int(self.config.input_size / patch_size)
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim))
            self.pos_drop = nn.Dropout(p=0.)

            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
            self.dropout = nn.Dropout(config.hidden_dropout_prob)

            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
                self._init_visual_bbox(img_size=(size, size))

            from functools import partial
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
            self.norm = norm_layer(embed_dim)

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _init_visual_bbox(self, img_size=(14, 14), max_len=1000):
        """Precompute one layout bbox per image patch (plus a CLS box) on a
        ``max_len``-normalized coordinate grid; cached on ``self.visual_bbox``.
        """
        visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len),
                                  img_size[1], rounding_mode='trunc')
        visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len),
                                  img_size[0], rounding_mode='trunc')
        visual_bbox = torch.stack(
            [
                visual_bbox_x[:-1].repeat(img_size[0], 1),
                visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1),
                visual_bbox_x[1:].repeat(img_size[0], 1),
                visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1),
            ],
            dim=-1,
        ).view(-1, 4)

        # CLS gets a near-full-page box; +1/-1 keep it off the exact borders.
        cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
        self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)

    def _calc_visual_bbox(self, device, dtype, bsz):  # , img_size=(14, 14), max_len=1000):
        # Broadcast the cached patch bboxes across the batch.
        visual_bbox = self.visual_bbox.repeat(bsz, 1, 1)
        visual_bbox = visual_bbox.to(device).type(dtype)
        return visual_bbox

    def forward_image(self, x):
        """Embed an image into a (B, 1 + num_patches, hidden) token sequence.

        In detection mode positional embeddings are interpolated inside
        ``patch_embed`` (variable input size); otherwise they are added here.
        """
        if self.detection:
            x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
        else:
            x = self.patch_embed(x)
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        if self.pos_embed is not None and self.detection:
            cls_tokens = cls_tokens + self.pos_embed[:, :1, :]

        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None and not self.detection:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        x = self.norm(x)
        return x

    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids=None,
        bbox=None,
        attention_mask=None,
        token_type_ids=None,
        valid_span=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        images=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encoder-only model: caching is never used, regardless of the arg.
        use_cache = False

        # if input_ids is not None and inputs_embeds is not None:
        #     raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif images is not None:
            batch_size = len(images)
            device = images.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or images")

        if not self.image_only:
            # past_key_values_length
            past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

            if attention_mask is None:
                attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if not self.image_only:
            if bbox is None:
                bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)

            embedding_output = self.embeddings(
                input_ids=input_ids,
                bbox=bbox,
                position_ids=position_ids,
                token_type_ids=token_type_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )

        final_bbox = final_position_ids = None
        Hp = Wp = None
        if images is not None:
            patch_size = 16
            Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size)
            visual_emb = self.forward_image(images)
            if self.detection:
                visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
                if self.image_only:
                    attention_mask = visual_attention_mask
                else:
                    attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
            elif self.image_only:
                attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)

            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
                if self.config.has_spatial_attention_bias:
                    visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size)
                    if self.image_only:
                        final_bbox = visual_bbox
                    else:
                        final_bbox = torch.cat([bbox, visual_bbox], dim=1)

                # Visual tokens get fresh 0..N-1 position ids, appended after
                # the text position ids when text is present.
                visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat(
                    batch_size, 1)
                if self.image_only:
                    final_position_ids = visual_position_ids
                else:
                    position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
                    position_ids = position_ids.expand_as(input_ids)
                    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)

            if self.image_only:
                embedding_output = visual_emb
            else:
                embedding_output = torch.cat([embedding_output, visual_emb], dim=1)
                embedding_output = self.LayerNorm(embedding_output)
                embedding_output = self.dropout(embedding_output)
        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
            if self.config.has_spatial_attention_bias:
                final_bbox = bbox
            if self.config.has_relative_attention_bias:
                position_ids = self.embeddings.position_ids[:, :input_shape[1]]
                position_ids = position_ids.expand_as(input_ids)
                final_position_ids = position_ids

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device)

        encoder_outputs = self.encoder(
            embedding_output,
            bbox=final_bbox,
            position_ids=final_position_ids,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            Hp=Hp,
            Wp=Wp,
            valid_span=valid_span,
        )

        if self.detection:
            # Detection mode returns the FPN feature dict directly.
            return encoder_outputs

        sequence_output = encoder_outputs[0]
        # No pooler in this model; kept as None for output-shape compatibility.
        pooled_output = None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
class LayoutLMv3ClassificationHead(nn.Module):
    """Sentence-level classification head (mirrors RobertaClassificationHead).

    Pipeline: dropout -> dense -> tanh -> dropout -> output projection.
    With ``pool_feature=True`` the dense layer expects a concatenation of
    three hidden vectors (3 * hidden_size).
    """

    def __init__(self, config, pool_feature=False):
        super().__init__()
        self.pool_feature = pool_feature
        # Triple-width input when pooled features are concatenated upstream.
        in_features = config.hidden_size * 3 if pool_feature else config.hidden_size
        self.dense = nn.Linear(in_features, config.hidden_size)
        # classifier_dropout overrides the generic hidden dropout when set.
        drop_p = config.hidden_dropout_prob if config.classifier_dropout is None else config.classifier_dropout
        self.dropout = nn.Dropout(drop_p)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        # x = features[:, 0, :]  # callers pass the <s> (CLS) vector already
        hidden = self.dense(self.dropout(x))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + images=images, + valid_span=valid_span, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.layoutlmv3 = LayoutLMv3Model(config) + # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + valid_span=None, + head_mask=None, + inputs_embeds=None, + 
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
    """LayoutLMv3 with an extractive-QA head predicting answer-span start and
    end logits over the token sequence.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.layoutlmv3 = LayoutLMv3Model(config)
        # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        valid_span=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        bbox=None,
        images=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlmv3(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            bbox=bbox,
            images=images,
            valid_span=valid_span,
        )

        sequence_output = outputs[0]

        # Head emits 2 values per token: (start_logit, end_logit).
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + bbox=bbox, + images=images, + valid_span=valid_span, + ) + + sequence_output = outputs[0][:, 0, :] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..f340d3c6aca04b6567614e6aa221f7c542239305 --- /dev/null 
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for LayoutLMv3, refer to RoBERTa."""

from transformers.models.roberta import RobertaTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


class LayoutLMv3Tokenizer(RobertaTokenizer):
    """Slow LayoutLMv3 tokenizer.

    Identical to RoBERTa's byte-level BPE tokenizer; only the vocab file
    names and model input names are declared here.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization classes for LayoutLMv3, refer to RoBERTa."""


from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
from transformers.utils import logging

from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class LayoutLMv3TokenizerFast(RobertaTokenizerFast):
    """Fast (Rust-backed) LayoutLMv3 tokenizer.

    Identical to RoBERTa's fast byte-level BPE tokenizer; declares the vocab
    file names, model input names, and its paired slow tokenizer class.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = LayoutLMv3Tokenizer
default_setup, launch, DefaultPredictor + + +def add_vit_config(cfg): + """ + Add config for VIT. + """ + _C = cfg + + _C.MODEL.VIT = CN() + + # CoaT model name. + _C.MODEL.VIT.NAME = "" + + # Output features from CoaT backbone. + _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"] + + _C.MODEL.VIT.IMG_SIZE = [224, 224] + + _C.MODEL.VIT.POS_TYPE = "shared_rel" + + _C.MODEL.VIT.DROP_PATH = 0. + + _C.MODEL.VIT.MODEL_KWARGS = "{}" + + _C.SOLVER.OPTIMIZER = "ADAMW" + + _C.SOLVER.BACKBONE_MULTIPLIER = 1.0 + + _C.AUG = CN() + + _C.AUG.DETR = False + + _C.MODEL.IMAGE_ONLY = True + _C.PUBLAYNET_DATA_DIR_TRAIN = "" + _C.PUBLAYNET_DATA_DIR_TEST = "" + _C.FOOTNOTE_DATA_DIR_TRAIN = "" + _C.FOOTNOTE_DATA_DIR_VAL = "" + _C.SCIHUB_DATA_DIR_TRAIN = "" + _C.SCIHUB_DATA_DIR_TEST = "" + _C.JIAOCAI_DATA_DIR_TRAIN = "" + _C.JIAOCAI_DATA_DIR_TEST = "" + _C.ICDAR_DATA_DIR_TRAIN = "" + _C.ICDAR_DATA_DIR_TEST = "" + _C.M6DOC_DATA_DIR_TEST = "" + _C.DOCSTRUCTBENCH_DATA_DIR_TEST = "" + _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = "" + _C.CACHE_DIR = "" + _C.MODEL.CONFIG_PATH = "" + + # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS + # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS + _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1 + + +def setup(args, device): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + + # add_coat_config(cfg) + add_vit_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model + cfg.merge_from_list(args.opts) + + # 使用统一的device配置 + cfg.MODEL.DEVICE = device + + cfg.freeze() + default_setup(cfg, args) + + #@todo 可以删掉这块? 
+ # register_coco_instances( + # "scihub_train", + # {}, + # cfg.SCIHUB_DATA_DIR_TRAIN + ".json", + # cfg.SCIHUB_DATA_DIR_TRAIN + # ) + + return cfg + + +class DotDict(dict): + def __init__(self, *args, **kwargs): + super(DotDict, self).__init__(*args, **kwargs) + + def __getattr__(self, key): + if key not in self.keys(): + return None + value = self[key] + if isinstance(value, dict): + value = DotDict(value) + return value + + def __setattr__(self, key, value): + self[key] = value + + +class Layoutlmv3_Predictor(object): + def __init__(self, weights, config_file, device): + layout_args = { + "config_file": config_file, + "resume": False, + "eval_only": False, + "num_gpus": 1, + "num_machines": 1, + "machine_rank": 0, + "dist_url": "tcp://127.0.0.1:57823", + "opts": ["MODEL.WEIGHTS", weights], + } + layout_args = DotDict(layout_args) + + cfg = setup(layout_args, device) + self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", + "table_footnote", "isolate_formula", "formula_caption"] + MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping + self.predictor = DefaultPredictor(cfg) + + def __call__(self, image, ignore_catids=[]): + # page_layout_result = { + # "layout_dets": [] + # } + layout_dets = [] + outputs = self.predictor(image) + boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist() + labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist() + scores = outputs["instances"].to("cpu")._fields["scores"].tolist() + for bbox_idx in range(len(boxes)): + if labels[bbox_idx] in ignore_catids: + continue + layout_dets.append({ + "category_id": labels[bbox_idx], + "poly": [ + boxes[bbox_idx][0], boxes[bbox_idx][1], + boxes[bbox_idx][2], boxes[bbox_idx][1], + boxes[bbox_idx][2], boxes[bbox_idx][3], + boxes[bbox_idx][0], boxes[bbox_idx][3], + ], + "score": scores[bbox_idx] + }) + return layout_dets diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py 
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn

from detectron2.config import configurable
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage

from detectron2.modeling.backbone import Backbone, build_backbone
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY

from detectron2.modeling.meta_arch import GeneralizedRCNN

from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
from contextlib import contextmanager
from itertools import count


@META_ARCH_REGISTRY.register()
class VLGeneralizedRCNN(GeneralizedRCNN):
    """Generalized R-CNN whose backbone consumes a dict batch (see get_batch).

    Any model containing the following three components:
      1. Per-image feature extraction (aka backbone)
      2. Region proposal generation
      3. Per-region feature extraction and prediction
    """

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """Training-time forward; delegates to :meth:`inference` in eval mode.

        Args:
            batched_inputs: batched outputs of :class:`DatasetMapper`, one
                dict per image containing:

                * image: Tensor in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`.
                * proposals (optional): precomputed :class:`Instances` proposals.
                * "height", "width" (int): output resolution used at inference
                  (see :meth:`postprocess`).

        Returns:
            In training: dict[str, Tensor] of proposal + detector losses.
            In eval: list[dict], one per image, each with an "instances" key
            holding "pred_boxes", "pred_classes", "scores", etc.
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        # Unlike plain GeneralizedRCNN, the backbone takes a dict batch
        # rather than the raw image tensor.
        batch = self.get_batch(batched_inputs, images)
        features = self.backbone(batch)

        if self.proposal_generator is not None:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`.
            detected_instances (None or list[Instances]): if not None, one
                ``Instances`` per image holding known "pred_boxes" and
                "pred_classes"; box detection is then skipped and only the
                other per-ROI outputs are predicted.
            do_postprocess (bool): whether to apply post-processing.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`;
            otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        batch = self.get_batch(batched_inputs, images)
        features = self.backbone(batch)

        if detected_instances is None:
            if self.proposal_generator is not None:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        return results

    def get_batch(self, examples, images):
        """Build the dict consumed by the backbone.

        Only image-only batches are supported here: when no example carries a
        "bbox" field, the backbone gets just the batched image tensor.

        FIX: the original fall-through was ``return input``, which returned
        the *builtin* ``input`` function (a latent bug on a path that was
        never meant to be taken) — fail loudly instead.

        Args:
            examples: raw batched inputs (list of dicts).
            images: the preprocessed ``ImageList``.

        Returns:
            dict: {"images": batched image tensor}.

        Raises:
            NotImplementedError: if the batch carries "bbox" annotations.
        """
        if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only
            return {"images": images.tensor}
        raise NotImplementedError(
            "VLGeneralizedRCNN.get_batch only supports image-only inputs "
            "(batched examples must not carry a 'bbox' field)."
        )

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """Execute inference over ``batched_inputs`` in mini-batches of 2,
        instead of the full length of the list.

        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`.
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        # NOTE: renamed the loop variable from ``input`` to avoid shadowing
        # the builtin.
        for idx, single_input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(single_input)
            instances.append(instance)
            # Flush every 2 items, and once more at the end of the list.
            if len(inputs) == 2 or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=True,
                    )
                )
                inputs, instances = [], []
        return outputs
class GenericMask:
    """Lazy converter between mask representations.

    Attributes:
        polygons (list[ndarray]): polygons for this mask, each ndarray in
            [x, y, x, y, ...] format.
        mask (ndarray): a binary mask of shape (height, width).
    """

    def __init__(self, mask_or_polygons, height, width):
        self._mask = self._polygons = self._has_holes = None
        self.height = height
        self.width = width

        m = mask_or_polygons
        if isinstance(m, dict):
            # COCO RLE encoding
            assert "counts" in m and "size" in m
            if isinstance(m["counts"], list):  # uncompressed RLEs
                h, w = m["size"]
                assert h == height and w == width
                m = mask_util.frPyObjects(m, h, w)
            self._mask = mask_util.decode(m)[:, :]
            return

        if isinstance(m, list):  # list[ndarray] of polygons
            self._polygons = [np.asarray(x).reshape(-1) for x in m]
            return

        if isinstance(m, np.ndarray):  # assumed to be a binary mask
            assert m.shape[1] != 2, m.shape
            assert m.shape == (
                height,
                width,
            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
            self._mask = m.astype("uint8")
            return

        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))

    @property
    def mask(self):
        # Computed lazily from polygons on first access.
        if self._mask is None:
            self._mask = self.polygons_to_mask(self._polygons)
        return self._mask

    @property
    def polygons(self):
        # Computed lazily from the binary mask on first access.
        if self._polygons is None:
            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
        return self._polygons

    @property
    def has_holes(self):
        if self._has_holes is None:
            if self._mask is not None:
                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
            else:
                # If the original format is polygon, it does not have holes.
                self._has_holes = False
        return self._has_holes

    def mask_to_polygons(self, mask):
        # cv2.RETR_CCOMP retrieves all contours in a 2-level hierarchy:
        # external boundaries in hierarchy-1, holes in hierarchy-2.
        # cv2.CHAIN_APPROX_NONE keeps all polygon vertices.
        mask = np.ascontiguousarray(mask)  # some versions of cv2 do not support non-contiguous arr
        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
        hierarchy = res[-1]
        if hierarchy is None:  # empty mask
            return [], False
        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
        res = res[-2]
        res = [x.flatten() for x in res]
        # OpenCV coordinates are integers in [0, W-1 or H-1]. Add 0.5 to move
        # them into real-value coordinate space. (A better solution would be
        # to first +0.5 and then dilate the returned polygon by 0.5.)
        res = [x + 0.5 for x in res if len(x) >= 6]
        return res, has_holes

    def polygons_to_mask(self, polygons):
        rle = mask_util.frPyObjects(polygons, self.height, self.width)
        rle = mask_util.merge(rle)
        return mask_util.decode(rle)[:, :]

    def area(self):
        return self.mask.sum()

    def bbox(self):
        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
        p = mask_util.merge(p)
        bbox = mask_util.toBbox(p)
        # Convert XYWH -> XYXY.
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]
        return bbox


class _PanopticPrediction:
    """Unify different panoptic annotation/prediction formats."""

    def __init__(self, panoptic_seg, segments_info, metadata=None):
        if segments_info is None:
            assert metadata is not None
            # If "segments_info" is None, we assume "panoptic_img" is a
            # H*W int32 image storing the panoptic_id in the format of
            # category_id * label_divisor + instance_id. We reserve -1 for
            # the VOID label.
            label_divisor = metadata.label_divisor
            segments_info = []
            for panoptic_label in np.unique(panoptic_seg.numpy()):
                if panoptic_label == -1:
                    # VOID region.
                    continue
                pred_class = panoptic_label // label_divisor
                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
                segments_info.append(
                    {
                        "id": int(panoptic_label),
                        "category_id": int(pred_class),
                        "isthing": bool(isthing),
                    }
                )
            del metadata

        self._seg = panoptic_seg

        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
        areas = areas.numpy()
        sorted_idxs = np.argsort(-areas)
        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
        self._seg_ids = self._seg_ids.tolist()
        for sid, area in zip(self._seg_ids, self._seg_areas):
            if sid in self._sinfo:
                self._sinfo[sid]["area"] = float(area)

    def non_empty_mask(self):
        """
        Returns:
            (H, W) array, a mask for all pixels that have a prediction
        """
        empty_ids = []
        for id in self._seg_ids:
            if id not in self._sinfo:
                empty_ids.append(id)
        if len(empty_ids) == 0:
            return np.zeros(self._seg.shape, dtype=np.uint8)
        assert (
            len(empty_ids) == 1
        ), ">1 ids corresponds to no labels. This is currently not supported"
        # FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
        # use the builtin bool, which is what the alias pointed to.
        return (self._seg != empty_ids[0]).numpy().astype(bool)

    def semantic_masks(self):
        """Yield (mask, segment_info) for each "stuff" (non-thing) segment."""
        for sid in self._seg_ids:
            sinfo = self._sinfo.get(sid)
            if sinfo is None or sinfo["isthing"]:
                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or
                # semantic predictions.
                continue
            # FIX: bool instead of the removed np.bool alias.
            yield (self._seg == sid).numpy().astype(bool), sinfo

    def instance_masks(self):
        """Yield (mask, segment_info) for each non-empty "thing" segment."""
        for sid in self._seg_ids:
            sinfo = self._sinfo.get(sid)
            if sinfo is None or not sinfo["isthing"]:
                continue
            # FIX: bool instead of the removed np.bool alias.
            mask = (self._seg == sid).numpy().astype(bool)
            if mask.sum() > 0:
                yield mask, sinfo


def _create_text_labels(classes, scores, class_names, is_crowd=None):
    """Build display labels like "plain text 97%" for each instance.

    Args:
        classes (list[int] or None): class indices.
        scores (list[float] or None): confidences in [0, 1].
        class_names (list[str] or None): index -> human-readable name.
        is_crowd (list[bool] or None): appends "|crowd" where True.

    Returns:
        list[str] or None
    """
    labels = None
    if classes is not None:
        if class_names is not None and len(class_names) > 0:
            labels = [class_names[i] for i in classes]
        else:
            labels = [str(i) for i in classes]

    if scores is not None:
        if labels is None:
            labels = ["{:.0f}%".format(s * 100) for s in scores]
        else:
            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
    if labels is not None and is_crowd is not None:
        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
    return labels


class VisImage:
    """Matplotlib-backed canvas that renders an RGB image plus overlays."""

    def __init__(self, img, scale=1.0):
        """
        Args:
            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
            scale (float): scale the input image
        """
        self.img = img
        self.scale = scale
        self.width, self.height = img.shape[1], img.shape[0]
        self._setup_figure(img)

    def _setup_figure(self, img):
        """
        Args:
            Same as in :meth:`__init__()`.

        Returns:
            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
        """
        fig = mplfigure.Figure(frameon=False)
        self.dpi = fig.get_dpi()
        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
        # (https://github.com/matplotlib/matplotlib/issues/15363)
        fig.set_size_inches(
            (self.width * self.scale + 1e-2) / self.dpi,
            (self.height * self.scale + 1e-2) / self.dpi,
        )
        self.canvas = FigureCanvasAgg(fig)
        # Axes span the full figure so image pixels map 1:1 to canvas pixels.
        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
        ax.axis("off")
        self.fig = fig
        self.ax = ax
        self.reset_image(img)

    def reset_image(self, img):
        """
        Args:
            img: same as in __init__
        """
        img = img.astype("uint8")
        # extent flips the y-axis so (0, 0) is the top-left corner.
        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")

    def save(self, filepath):
        """
        Args:
            filepath (str): absolute path, including the file name, where
                the visualized image will be saved.
        """
        self.fig.savefig(filepath)

    def get_image(self):
        """
        Returns:
            ndarray:
                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
                The shape is scaled w.r.t the input image using the given
                `scale` argument.
        """
        canvas = self.canvas
        s, (width, height) = canvas.print_to_buffer()

        buffer = np.frombuffer(s, dtype="uint8")

        img_rgba = buffer.reshape(height, width, 4)
        # Drop the alpha channel; callers expect plain RGB.
        rgb, alpha = np.split(img_rgba, [3], axis=2)
        return rgb.astype("uint8")
+ + Note that the exact visualization style for the high-level wrappers are subject to change. + Style such as color, opacity, label contents, visibility of labels, or even the visibility + of objects themselves (e.g. when the object is too small) may change according + to different heuristics, as long as the results still look visually reasonable. + + To obtain a consistent style, you can implement custom drawing functions with the + abovementioned primitive methods instead. If you need more customized visualization + styles, you can process the data yourself following their format documented in + tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not + intend to satisfy everyone's preference on drawing styles. + + This visualizer focuses on high rendering quality rather than performance. It is not + designed to be used for real-time applications. + """ + + # TODO implement a fast, rasterized version using OpenCV + + def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): + """ + Args: + img_rgb: a numpy array of shape (H, W, C), where H and W correspond to + the height and width of the image respectively. C is the number of + color channels. The image is required to be in RGB format since that + is a requirement of the Matplotlib library. The image is also expected + to be in the range [0, 255]. + metadata (Metadata): dataset metadata (e.g. class names and colors) + instance_mode (ColorMode): defines one of the pre-defined style for drawing + instances on an image. 
+ """ + self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) + if metadata is None: + metadata = MetadataCatalog.get("__nonexist__") + self.metadata = metadata + self.output = VisImage(self.img, scale=scale) + self.cpu_device = torch.device("cpu") + + # too small texts are useless, therefore clamp to 9 + self._default_font_size = max( + np.sqrt(self.output.height * self.output.width) // 90, 10 // scale + ) + self._instance_mode = instance_mode + self.keypoint_threshold = _KEYPOINT_THRESHOLD + + def draw_instance_predictions(self, predictions): + """ + Draw instance-level prediction results on an image. + + Args: + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to draw: + "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). + + Returns: + output (VisImage): image object with visualizations. + """ + boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None + scores = predictions.scores if predictions.has("scores") else None + classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None + labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) + keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None + + if predictions.has("pred_masks"): + masks = np.asarray(predictions.pred_masks) + masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] + else: + masks = None + + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes + ] + alpha = 0.8 + else: + colors = None + alpha = 0.5 + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image( + self._create_grayscale_image( + (predictions.pred_masks.any(dim=0) > 0).numpy() + if predictions.has("pred_masks") + else None + ) + ) + alpha = 0.3 + + 
self.overlay_instances( + masks=masks, + boxes=boxes, + labels=labels, + keypoints=keypoints, + assigned_colors=colors, + alpha=alpha, + ) + return self.output + + def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8): + """ + Draw semantic segmentation predictions/labels. + + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + + Returns: + output (VisImage): image object with visualizations. + """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + + binary_mask = (sem_seg == label).astype(np.uint8) + text = self.metadata.stuff_classes[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7): + """ + Draw panoptic prediction annotations or results. + + Args: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each + segment. + segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. + If it is a ``list[dict]``, each dict contains keys "id", "category_id". + If None, category id of each pixel is computed by + ``pixel // metadata.label_divisor``. + area_threshold (int): stuff segments with less than `area_threshold` are not drawn. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) + + # draw mask for all semantic segments first i.e. "stuff" + for mask, sinfo in pred.semantic_masks(): + category_idx = sinfo["category_id"] + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] + except AttributeError: + mask_color = None + + text = self.metadata.stuff_classes[category_idx] + self.draw_binary_mask( + mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + + # draw mask for all instances second + all_instances = list(pred.instance_masks()) + if len(all_instances) == 0: + return self.output + masks, sinfo = list(zip(*all_instances)) + category_ids = [x["category_id"] for x in sinfo] + + try: + scores = [x["score"] for x in sinfo] + except KeyError: + scores = None + labels = _create_text_labels( + category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo] + ) + + try: + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids + ] + except AttributeError: + colors = None + self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) + + return self.output + + draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility + + def draw_dataset_dict(self, dic): + """ + Draw annotations/segmentaions in Detectron2 Dataset format. + + Args: + dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + annos = dic.get("annotations", None) + if annos: + if "segmentation" in annos[0]: + masks = [x["segmentation"] for x in annos] + else: + masks = None + if "keypoints" in annos[0]: + keypts = [x["keypoints"] for x in annos] + keypts = np.array(keypts).reshape(len(annos), -1, 3) + else: + keypts = None + + boxes = [ + BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) + if len(x["bbox"]) == 4 + else x["bbox"] + for x in annos + ] + + colors = None + category_ids = [x["category_id"] for x in annos] + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) + for c in category_ids + ] + names = self.metadata.get("thing_classes", None) + labels = _create_text_labels( + category_ids, + scores=None, + class_names=names, + is_crowd=[x.get("iscrowd", 0) for x in annos], + ) + self.overlay_instances( + labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors + ) + + sem_seg = dic.get("sem_seg", None) + if sem_seg is None and "sem_seg_file_name" in dic: + with PathManager.open(dic["sem_seg_file_name"], "rb") as f: + sem_seg = Image.open(f) + sem_seg = np.asarray(sem_seg, dtype="uint8") + if sem_seg is not None: + self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) + + pan_seg = dic.get("pan_seg", None) + if pan_seg is None and "pan_seg_file_name" in dic: + with PathManager.open(dic["pan_seg_file_name"], "rb") as f: + pan_seg = Image.open(f) + pan_seg = np.asarray(pan_seg) + from panopticapi.utils import rgb2id + + pan_seg = rgb2id(pan_seg) + if pan_seg is not None: + segments_info = dic["segments_info"] + pan_seg = torch.tensor(pan_seg) + self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) + return self.output + + def overlay_instances( + self, + *, + boxes=None, + labels=None, + masks=None, + keypoints=None, + assigned_colors=None, + alpha=0.5, + ): + """ + Args: + boxes (Boxes, RotatedBoxes or 
ndarray): either a :class:`Boxes`, + or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, + or a :class:`RotatedBoxes`, + or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image, + labels (list[str]): the text to be displayed for each instance. + masks (masks-like object): Supported types are: + + * :class:`detectron2.structures.PolygonMasks`, + :class:`detectron2.structures.BitMasks`. + * list[list[ndarray]]: contains the segmentation masks for all objects in one image. + The first level of the list corresponds to individual instances. The second + level to all the polygon that compose the instance, and the third level + to the polygon coordinates. The third level should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + * list[ndarray]: each ndarray is a binary mask of shape (H, W). + * list[dict]: each dict is a COCO-style RLE. + keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), + where the N is the number of instances and K is the number of keypoints. + The last dimension corresponds to (x, y, visibility or score). + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + num_instances = 0 + if boxes is not None: + boxes = self._convert_boxes(boxes) + num_instances = len(boxes) + if masks is not None: + masks = self._convert_masks(masks) + if num_instances: + assert len(masks) == num_instances + else: + num_instances = len(masks) + if keypoints is not None: + if num_instances: + assert len(keypoints) == num_instances + else: + num_instances = len(keypoints) + keypoints = self._convert_keypoints(keypoints) + if labels is not None: + assert len(labels) == num_instances + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + if boxes is not None and boxes.shape[1] == 5: + return self.overlay_rotated_instances( + boxes=boxes, labels=labels, assigned_colors=assigned_colors + ) + + # Display in largest to smallest order to reduce occlusion. + areas = None + if boxes is not None: + areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) + elif masks is not None: + areas = np.asarray([x.area() for x in masks]) + + if areas is not None: + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] if boxes is not None else None + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None + assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] + keypoints = keypoints[sorted_idxs] if keypoints is not None else None + + for i in range(num_instances): + color = assigned_colors[i] + if boxes is not None: + self.draw_box(boxes[i], edge_color=color) + + if masks is not None: + for segment in masks[i].polygons: + self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) + + if labels is not None: + # first get a box + if boxes is not None: + x0, y0, x1, y1 = boxes[i] + text_pos = (x0, y0) # if drawing boxes, put text on the box corner. 
+ horiz_align = "left" + elif masks is not None: + # skip small mask without polygon + if len(masks[i].polygons) == 0: + continue + + x0, y0, x1, y1 = masks[i].bbox() + + # draw text in the center (defined by median) when box is not drawn + # median is less sensitive to outliers. + text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] + horiz_align = "center" + else: + continue # drawing the box confidence for keypoints isn't very useful. + # for small objects, draw text at the side to avoid occlusion + instance_area = (y1 - y0) * (x1 - x0) + if ( + instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale + or y1 - y0 < 40 * self.output.scale + ): + if y1 >= self.output.height - 5: + text_pos = (x1, y0) + else: + text_pos = (x0, y1) + + height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) + * 0.5 + * self._default_font_size + ) + self.draw_text( + labels[i], + text_pos, + color=lighter_color, + horizontal_alignment=horiz_align, + font_size=font_size, + ) + + # draw keypoints + if keypoints is not None: + for keypoints_per_instance in keypoints: + self.draw_and_connect_keypoints(keypoints_per_instance) + + return self.output + + def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): + """ + Args: + boxes (ndarray): an Nx5 numpy array of + (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image. + labels (list[str]): the text to be displayed for each instance. + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + num_instances = len(boxes) + + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + + # Display in largest to smallest order to reduce occlusion. + if boxes is not None: + areas = boxes[:, 2] * boxes[:, 3] + + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + colors = [assigned_colors[idx] for idx in sorted_idxs] + + for i in range(num_instances): + self.draw_rotated_box_with_label( + boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None + ) + + return self.output + + def draw_and_connect_keypoints(self, keypoints): + """ + Draws keypoints of an instance and follows the rules for keypoint connections + to draw lines between appropriate keypoints. This follows color heuristics for + line color. + + Args: + keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints + and the last dimension corresponds to (x, y, probability). + + Returns: + output (VisImage): image object with visualizations. + """ + visible = {} + keypoint_names = self.metadata.get("keypoint_names") + for idx, keypoint in enumerate(keypoints): + # draw keypoint + x, y, prob = keypoint + if prob > self.keypoint_threshold: + self.draw_circle((x, y), color=_RED) + if keypoint_names: + keypoint_name = keypoint_names[idx] + visible[keypoint_name] = (x, y) + + if self.metadata.get("keypoint_connection_rules"): + for kp0, kp1, color in self.metadata.keypoint_connection_rules: + if kp0 in visible and kp1 in visible: + x0, y0 = visible[kp0] + x1, y1 = visible[kp1] + color = tuple(x / 255.0 for x in color) + self.draw_line([x0, x1], [y0, y1], color=color) + + # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip + # Note that this strategy is specific to person keypoints. 
+ # For other keypoints, it should just do nothing + try: + ls_x, ls_y = visible["left_shoulder"] + rs_x, rs_y = visible["right_shoulder"] + mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 + except KeyError: + pass + else: + # draw line from nose to mid-shoulder + nose_x, nose_y = visible.get("nose", (None, None)) + if nose_x is not None: + self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) + + try: + # draw line from mid-shoulder to mid-hip + lh_x, lh_y = visible["left_hip"] + rh_x, rh_y = visible["right_hip"] + except KeyError: + pass + else: + mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 + self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) + return self.output + + """ + Primitive drawing functions: + """ + + def draw_text( + self, + text, + position, + *, + font_size=None, + color="g", + horizontal_alignment="center", + rotation=0, + ): + """ + Args: + text (str): class label + position (tuple): a tuple of the x and y coordinates to place text on image. + font_size (int, optional): font of the text. If not provided, a font size + proportional to the image width is calculated and used. + color: color of the text. Refer to `matplotlib.colors` for full list + of formats that are accepted. + horizontal_alignment (str): see `matplotlib.text.Text` + rotation: rotation angle in degrees CCW + + Returns: + output (VisImage): image object with text drawn. 
+ """ + if not font_size: + font_size = self._default_font_size + + # since the text background is dark, we don't want the text to be dark + color = np.maximum(list(mplc.to_rgb(color)), 0.2) + color[np.argmax(color)] = max(0.8, np.max(color)) + + x, y = position + self.output.ax.text( + x, + y, + text, + size=font_size * self.output.scale, + family="sans-serif", + bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, + verticalalignment="top", + horizontalalignment=horizontal_alignment, + color=color, + zorder=10, + rotation=rotation, + ) + return self.output + + def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): + """ + Args: + box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 + are the coordinates of the image's top left corner. x1 and y1 are the + coordinates of the image's bottom right corner. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + + Returns: + output (VisImage): image object with box drawn. + """ + x0, y0, x1, y1 = box_coord + width = x1 - x0 + height = y1 - y0 + + linewidth = max(self._default_font_size / 4, 1) + + self.output.ax.add_patch( + mpl.patches.Rectangle( + (x0, y0), + width, + height, + fill=False, + edgecolor=edge_color, + linewidth=linewidth * self.output.scale, + alpha=alpha, + linestyle=line_style, + ) + ) + return self.output + + def draw_rotated_box_with_label( + self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None + ): + """ + Draw a rotated box with label on its top-left corner. + + Args: + rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), + where cnt_x and cnt_y are the center coordinates of the box. + w and h are the width and height of the box. 
angle represents how + many degrees the box is rotated CCW with regard to the 0-degree box. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + label (string): label for rotated box. It will not be rendered when set to None. + + Returns: + output (VisImage): image object with box drawn. + """ + cnt_x, cnt_y, w, h, angle = rotated_box + area = w * h + # use thinner lines when the box is small + linewidth = self._default_font_size / ( + 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 + ) + + theta = angle * math.pi / 180.0 + c = math.cos(theta) + s = math.sin(theta) + rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] + # x: left->right ; y: top->down + rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] + for k in range(4): + j = (k + 1) % 4 + self.draw_line( + [rotated_rect[k][0], rotated_rect[j][0]], + [rotated_rect[k][1], rotated_rect[j][1]], + color=edge_color, + linestyle="--" if k == 1 else line_style, + linewidth=linewidth, + ) + + if label is not None: + text_pos = rotated_rect[1] # topleft corner + + height_ratio = h / np.sqrt(self.output.height * self.output.width) + label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size + ) + self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) + + return self.output + + def draw_circle(self, circle_coord, color, radius=3): + """ + Args: + circle_coord (list(int) or tuple(int)): contains the x and y coordinates + of the center of the circle. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. 
+ radius (int): radius of the circle. + + Returns: + output (VisImage): image object with box drawn. + """ + x, y = circle_coord + self.output.ax.add_patch( + mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) + ) + return self.output + + def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): + """ + Args: + x_data (list[int]): a list containing x values of all the points being drawn. + Length of list should match the length of y_data. + y_data (list[int]): a list containing y values of all the points being drawn. + Length of list should match the length of x_data. + color: color of the line. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + linestyle: style of the line. Refer to `matplotlib.lines.Line2D` + for a full list of formats that are accepted. + linewidth (float or None): width of the line. When it's None, + a default value will be computed and used. + + Returns: + output (VisImage): image object with line drawn. + """ + if linewidth is None: + linewidth = self._default_font_size / 3 + linewidth = max(linewidth, 1) + self.output.ax.add_line( + mpl.lines.Line2D( + x_data, + y_data, + linewidth=linewidth * self.output.scale, + color=color, + linestyle=linestyle, + ) + ) + return self.output + + def draw_binary_mask( + self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0 + ): + """ + Args: + binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and + W is the image width. Each value in the array is either a 0 or 1 value of uint8 + type. + color: color of the mask. Refer to `matplotlib.colors` for a full list of + formats that are accepted. If None, will pick a random color. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. + text (str): if None, will be drawn in the object's center of mass. + alpha (float): blending efficient. 
Smaller values lead to more transparent masks. + area_threshold (float): a connected component small than this will not be shown. + + Returns: + output (VisImage): image object with mask drawn. + """ + if color is None: + color = random_color(rgb=True, maximum=1) + color = mplc.to_rgb(color) + + has_valid_segment = False + binary_mask = binary_mask.astype("uint8") # opencv needs uint8 + mask = GenericMask(binary_mask, self.output.height, self.output.width) + shape2d = (binary_mask.shape[0], binary_mask.shape[1]) + + if not mask.has_holes: + # draw polygons for regular masks + for segment in mask.polygons: + area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) + if area < (area_threshold or 0): + continue + has_valid_segment = True + segment = segment.reshape(-1, 2) + self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) + else: + # TODO: Use Path/PathPatch to draw vector graphics: + # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon + rgba = np.zeros(shape2d + (4,), dtype="float32") + rgba[:, :, :3] = color + rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha + has_valid_segment = True + self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) + + if text is not None and has_valid_segment: + # TODO sometimes drawn on wrong objects. the heuristics here can improve. + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) + largest_component_id = np.argmax(stats[1:, -1]) + 1 + + # draw text on the largest component, as well as other very large components. 
+ for cid in range(1, _num_cc): + if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: + # median is more stable than centroid + # center = centroids[largest_component_id] + center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] + self.draw_text(text, center, color=lighter_color) + return self.output + + def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): + """ + Args: + segment: numpy array of shape Nx2, containing all the points in the polygon. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. If not provided, a darker shade + of the polygon color will be used instead. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + + Returns: + output (VisImage): image object with polygon drawn. + """ + if edge_color is None: + # make edge color darker than the polygon color + if alpha > 0.8: + edge_color = self._change_color_brightness(color, brightness_factor=-0.7) + else: + edge_color = color + edge_color = mplc.to_rgb(edge_color) + (1,) + + polygon = mpl.patches.Polygon( + segment, + fill=True, + facecolor=mplc.to_rgb(color) + (alpha,), + edgecolor=edge_color, + linewidth=max(self._default_font_size // 15 * self.output.scale, 1), + ) + self.output.ax.add_patch(polygon) + return self.output + + """ + Internal methods: + """ + + def _jitter(self, color): + """ + Randomly modifies given color to produce a slightly different color than the color given. + + Args: + color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color + picked. The values in the list are in the [0.0, 1.0] range. + + Returns: + jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the + color after being jittered. The values in the list are in the [0.0, 1.0] range. 
+ """ + color = mplc.to_rgb(color) + vec = np.random.rand(3) + # better to do it in another color space + vec = vec / np.linalg.norm(vec) * 0.5 + res = np.clip(vec + color, 0, 1) + return tuple(res) + + def _create_grayscale_image(self, mask=None): + """ + Create a grayscale version of the original image. + The colors in masked area, if given, will be kept. + """ + img_bw = self.img.astype("f4").mean(axis=2) + img_bw = np.stack([img_bw] * 3, axis=2) + if mask is not None: + img_bw[mask] = self.img[mask] + return img_bw + + def _change_color_brightness(self, color, brightness_factor): + """ + Depending on the brightness_factor, gives a lighter or darker color i.e. a color with + less or more saturation than the original color. + + Args: + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of + 0 will correspond to no change, a factor in [-1.0, 0) range will result in + a darker color and a factor in (0, 1.0] range will result in a lighter color. + + Returns: + modified_color (tuple[double]): a tuple containing the RGB values of the + modified color. Each value in the tuple is in the [0.0, 1.0] range. + """ + assert brightness_factor >= -1.0 and brightness_factor <= 1.0 + color = mplc.to_rgb(color) + polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) + modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) + modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness + modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness + modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) + return modified_color + + def _convert_boxes(self, boxes): + """ + Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. 
+ """ + if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): + return boxes.tensor.detach().numpy() + else: + return np.asarray(boxes) + + def _convert_masks(self, masks_or_polygons): + """ + Convert different format of masks or polygons to a tuple of masks and polygons. + + Returns: + list[GenericMask]: + """ + + m = masks_or_polygons + if isinstance(m, PolygonMasks): + m = m.polygons + if isinstance(m, BitMasks): + m = m.tensor.numpy() + if isinstance(m, torch.Tensor): + m = m.numpy() + ret = [] + for x in m: + if isinstance(x, GenericMask): + ret.append(x) + else: + ret.append(GenericMask(x, self.output.height, self.output.width)) + return ret + + def _convert_keypoints(self, keypoints): + if isinstance(keypoints, Keypoints): + keypoints = keypoints.tensor + keypoints = np.asarray(keypoints) + return keypoints + + def get_output(self): + """ + Returns: + output (VisImage): the image output containing the visualizations added + to the image. + """ + return self.output diff --git a/magic_pdf/model/pek_sub_modules/post_process.py b/magic_pdf/model/pek_sub_modules/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..aa050b612d29849b341b2850c9c4a1bcacf904dd --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/post_process.py @@ -0,0 +1,36 @@ +import re + +def layout_rm_equation(layout_res): + rm_idxs = [] + for idx, ele in enumerate(layout_res['layout_dets']): + if ele['category_id'] == 10: + rm_idxs.append(idx) + + for idx in rm_idxs[::-1]: + del layout_res['layout_dets'][idx] + return layout_res + + +def get_croped_image(image_pil, bbox): + x_min, y_min, x_max, y_max = bbox + croped_img = image_pil.crop((x_min, y_min, x_max, y_max)) + return croped_img + + +def latex_rm_whitespace(s: str): + """Remove unnecessary whitespace from LaTeX code. + """ + text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? 
{.*?})' + letter = '[a-zA-Z]' + noletter = '[\W_^\d]' + names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)] + s = re.sub(text_reg, lambda match: str(names.pop(0)), s) + news = s + while True: + s = news + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s) + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news) + news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) + if news == s: + break + return s \ No newline at end of file diff --git a/magic_pdf/model/pek_sub_modules/self_modify.py b/magic_pdf/model/pek_sub_modules/self_modify.py new file mode 100644 index 0000000000000000000000000000000000000000..47cb591ca60dbb0644ee59072f0cb7a49459770c --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/self_modify.py @@ -0,0 +1,260 @@ +import time +import copy +import base64 +import cv2 +import numpy as np +from io import BytesIO +from PIL import Image + +from paddleocr import PaddleOCR +from paddleocr.ppocr.utils.logging import get_logger +from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img +from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop +logger = get_logger() + +def img_decode(content: bytes): + np_arr = np.frombuffer(content, dtype=np.uint8) + return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED) + +def check_img(img): + if isinstance(img, bytes): + img = img_decode(img) + if isinstance(img, str): + image_file = img + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: + with open(image_file, 'rb') as f: + img_str = f.read() + img = img_decode(img_str) + if img is None: + try: + buf = BytesIO() + image = BytesIO(img_str) + im = Image.open(image) + rgb = im.convert('RGB') + rgb.save(buf, 'jpeg') + buf.seek(0) + image_bytes = buf.read() + data_base64 = str(base64.b64encode(image_bytes), + encoding="utf-8") + image_decode = base64.b64decode(data_base64) + img_array = 
np.frombuffer(image_decode, np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + except: + logger.error("error in loading image:{}".format(image_file)) + return None + if img is None: + logger.error("error in loading image:{}".format(image_file)) + return None + if isinstance(img, np.ndarray) and len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + return img + +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + + +def formula_in_text(mf_bbox, text_bbox): + x1, y1, x2, y2 = mf_bbox + x3, y3 = text_bbox[0] + x4, y4 = text_bbox[2] + left_box, right_box = None, None + same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2 + if not same_line: + return False, left_box, right_box + else: + drop_origin = False + left_x = x1 - 1 + right_x = x2 + 1 + if x3 < x1 and x2 < x4: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') + if x3 < x1 and x1 <= x4 <= x2: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + if x1 <= x3 <= x2 and x2 < x4: + drop_origin = True + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') 
+ if x1 <= x3 < x4 <= x2: + drop_origin = True + return drop_origin, left_box, right_box + + +def update_det_boxes(dt_boxes, mfdetrec_res): + new_dt_boxes = dt_boxes + for mf_box in mfdetrec_res: + flag, left_box, right_box = False, None, None + for idx, text_box in enumerate(new_dt_boxes): + ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box) + if ret: + new_dt_boxes.pop(idx) + if left_box is not None: + new_dt_boxes.append(left_box) + if right_box is not None: + new_dt_boxes.append(right_box) + break + + return new_dt_boxes + +class ModifiedPaddleOCR(PaddleOCR): + def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)): + """ + OCR with PaddleOCR + args: + img: img for OCR, support ndarray, img_path and list or ndarray + det: use text detection or not. If False, only rec will be exec. Default is True + rec: use text recognition or not. If False, only det will be exec. Default is True + cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + bin: binarize image to black and white. Default is False. + inv: invert image colors. Default is False. + alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white. 
+ """ + assert isinstance(img, (np.ndarray, list, str, bytes)) + if isinstance(img, list) and det == True: + logger.error('When input a list of images, det must be false') + exit(0) + if cls == True and self.use_angle_cls == False: + pass + # logger.warning( + # 'Since the angle classifier is not initialized, it will not be used during the forward process' + # ) + + img = check_img(img) + # for infer pdf file + if isinstance(img, list): + if self.page_num > len(img) or self.page_num == 0: + self.page_num = len(img) + imgs = img[:self.page_num] + else: + imgs = [img] + + def preprocess_image(_image): + _image = alpha_to_color(_image, alpha_color) + if inv: + _image = cv2.bitwise_not(_image) + if bin: + _image = binarize_img(_image) + return _image + + if det and rec: + ocr_res = [] + for idx, img in enumerate(imgs): + img = preprocess_image(img) + dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res) + if not dt_boxes and not rec_res: + ocr_res.append(None) + continue + tmp_res = [[box.tolist(), res] + for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + return ocr_res + elif det and not rec: + ocr_res = [] + for idx, img in enumerate(imgs): + img = preprocess_image(img) + dt_boxes, elapse = self.text_detector(img) + if not dt_boxes: + ocr_res.append(None) + continue + tmp_res = [box.tolist() for box in dt_boxes] + ocr_res.append(tmp_res) + return ocr_res + else: + ocr_res = [] + cls_res = [] + for idx, img in enumerate(imgs): + if not isinstance(img, list): + img = preprocess_image(img) + img = [img] + if self.use_angle_cls and cls: + img, cls_res_tmp, elapse = self.text_classifier(img) + if not rec: + cls_res.append(cls_res_tmp) + rec_res, elapse = self.text_recognizer(img) + ocr_res.append(rec_res) + if not rec: + return cls_res + return ocr_res + + def __call__(self, img, cls=True, mfd_res=None): + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + if img is None: + logger.debug("no valid image provided") + return None, None, 
time_dict + + start = time.time() + ori_im = img.copy() + dt_boxes, elapse = self.text_detector(img) + time_dict['det'] = elapse + + if dt_boxes is None: + logger.debug("no dt_boxes found, elapsed : {}".format(elapse)) + end = time.time() + time_dict['all'] = end - start + return None, None, time_dict + else: + logger.debug("dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), elapse)) + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + if mfd_res: + bef = time.time() + dt_boxes = update_det_boxes(dt_boxes, mfd_res) + aft = time.time() + logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), aft-bef)) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + if self.args.det_box_type == "quad": + img_crop = get_rotate_crop_image(ori_im, tmp_box) + else: + img_crop = get_minarea_rect_crop(ori_im, tmp_box) + img_crop_list.append(img_crop) + if self.use_angle_cls and cls: + img_crop_list, angle_list, elapse = self.text_classifier( + img_crop_list) + time_dict['cls'] = elapse + logger.debug("cls num : {}, elapsed : {}".format( + len(img_crop_list), elapse)) + + rec_res, elapse = self.text_recognizer(img_crop_list) + time_dict['rec'] = elapse + logger.debug("rec_res num : {}, elapsed : {}".format( + len(rec_res), elapse)) + if self.args.save_crop_res: + self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, + rec_res) + filter_boxes, filter_rec_res = [], [] + for box, rec_result in zip(dt_boxes, rec_res): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + end = time.time() + time_dict['all'] = end - start + return filter_boxes, filter_rec_res, time_dict \ No newline at end of file diff --git a/magic_pdf/model/pp_structure_v2.py b/magic_pdf/model/pp_structure_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..49b593086f8d915e3d8e40c740fcfbd857ae6330 --- /dev/null +++ 
b/magic_pdf/model/pp_structure_v2.py @@ -0,0 +1,87 @@ +import random + +from loguru import logger + +try: + from paddleocr import PPStructure +except ImportError: + logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"') + exit(1) + + +def region_to_bbox(region): + x0 = region[0][0] + y0 = region[0][1] + x1 = region[2][0] + y1 = region[2][1] + return [x0, y0, x1, y1] + + +class CustomPaddleModel: + def __init__(self, ocr: bool = False, show_log: bool = False): + self.model = PPStructure(table=False, ocr=ocr, show_log=show_log) + + def __call__(self, img): + try: + import cv2 + except ImportError: + logger.error("opencv-python not installed, please install by pip.") + exit(1) + # 将RGB图片转换为BGR格式适配paddle + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + result = self.model(img) + spans = [] + for line in result: + line.pop("img") + """ + 为paddle输出适配type no. + title: 0 # 标题 + text: 1 # 文本 + header: 2 # abandon + footer: 2 # abandon + reference: 1 # 文本 or abandon + equation: 8 # 行间公式 block + equation: 14 # 行间公式 text + figure: 3 # 图片 + figure_caption: 4 # 图片描述 + table: 5 # 表格 + table_caption: 6 # 表格描述 + """ + if line["type"] == "title": + line["category_id"] = 0 + elif line["type"] in ["text", "reference"]: + line["category_id"] = 1 + elif line["type"] == "figure": + line["category_id"] = 3 + elif line["type"] == "figure_caption": + line["category_id"] = 4 + elif line["type"] == "table": + line["category_id"] = 5 + elif line["type"] == "table_caption": + line["category_id"] = 6 + elif line["type"] == "equation": + line["category_id"] = 8 + elif line["type"] in ["header", "footer"]: + line["category_id"] = 2 + else: + logger.warning(f"unknown type: {line['type']}") + + # 兼容不输出score的paddleocr版本 + if line.get("score") is None: + line["score"] = 0.5 + random.random() * 0.5 + + res = line.pop("res", None) + if res is not None and len(res) > 0: + for span in res: + new_span = { + "category_id": 15, + "bbox": 
region_to_bbox(span["text_region"]), + "score": span["confidence"], + "text": span["text"], + } + spans.append(new_span) + + if len(spans) > 0: + result.extend(spans) + + return result diff --git a/magic_pdf/para/__init__.py b/magic_pdf/para/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/para/block_continuation_processor.py b/magic_pdf/para/block_continuation_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..b4aa59d7728f733ed74abb627b52fe8cca5650ac --- /dev/null +++ b/magic_pdf/para/block_continuation_processor.py @@ -0,0 +1,562 @@ +import os +import unicodedata + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class BlockContinuationProcessor: + """ + This class is used to process the blocks to detect block continuations. + """ + + def __init__(self) -> None: + pass + + def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3): + """ + This function checks if the two font types are similar. + Definition of similar font types: the two font types have a common prefix, + and the length of the common prefix is at least a certain ratio of the length of the shorter font type. + + Parameters + ---------- + font_type1 : str + font type 1 + font_type2 : str + font type 2 + prefix_length_ratio : float + minimum ratio of the common prefix length to the length of the shorter font type + + Returns + ------- + bool + True if the two font types are similar, False otherwise. 
+ """ + + if isinstance(font_type1, list): + font_type1 = font_type1[0] if font_type1 else "" + if isinstance(font_type2, list): + font_type2 = font_type2[0] if font_type2 else "" + + if font_type1 == font_type2: + return True + + # Find the length of the common prefix + common_prefix_length = len(os.path.commonprefix([font_type1, font_type2])) + + # Calculate the minimum prefix length based on the ratio + min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio) + + return common_prefix_length >= min_prefix_length + + def __is_same_block_font(self, block1, block2): + """ + This function compares the font of block1 and block2 + + Parameters + ---------- + block1 : dict + block1 + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 have the same font, else False + """ + block_1_font_type = safe_get(block1, "block_font_type", "") + block_1_font_size = safe_get(block1, "block_font_size", 0) + block_1_avg_char_width = safe_get(block1, "avg_char_width", 0) + + block_2_font_type = safe_get(block2, "block_font_type", "") + block_2_font_size = safe_get(block2, "block_font_size", 0) + block_2_avg_char_width = safe_get(block2, "avg_char_width", 0) + + if isinstance(block_1_font_size, list): + block_1_font_size = block_1_font_size[0] if block_1_font_size else 0 + if isinstance(block_2_font_size, list): + block_2_font_size = block_2_font_size[0] if block_2_font_size else 0 + + block_1_text = safe_get(block1, "text", "") + block_2_text = safe_get(block2, "text", "") + + if block_1_avg_char_width == 0 or block_2_avg_char_width == 0: + return False + + if not block_1_text or not block_2_text: + return False + else: + text_len_ratio = len(block_2_text) / len(block_1_text) + if text_len_ratio < 0.2: + avg_char_width_condition = ( + abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.5 + ) + else: + avg_char_width_condition = ( + abs(block_1_avg_char_width - 
block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.2 + ) + + block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1 + + return ( + self.__is_similar_font_type(block_1_font_type, block_2_font_type) + and avg_char_width_condition + and block_font_size_condtion + ) + + def _is_alphabet_char(self, char): + if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"): + return True + else: + return False + + def _is_chinese_char(self, char): + if char >= "\u4e00" and char <= "\u9fa5": + return True + else: + return False + + def _is_other_letter_char(self, char): + try: + cat = unicodedata.category(char) + if cat == "Lu" or cat == "Ll": + return not self._is_alphabet_char(char) and not self._is_chinese_char(char) + except TypeError: + print("The input to the function must be a single character.") + return False + + def _is_year(self, s: str): + try: + number = int(s) + return 1900 <= number <= 2099 + except ValueError: + return False + + def __is_para_font_consistent(self, para_1, para_2): + """ + This function compares the font of para1 and para2 + + Parameters + ---------- + para1 : dict + para1 + para2 : dict + para2 + + Returns + ------- + is_same : bool + True if para1 and para2 have the same font, else False + """ + if para_1 is None or para_2 is None: + return False + + para_1_font_type = safe_get(para_1, "para_font_type", "") + para_1_font_size = safe_get(para_1, "para_font_size", 0) + para_1_font_color = safe_get(para_1, "para_font_color", "") + + para_2_font_type = safe_get(para_2, "para_font_type", "") + para_2_font_size = safe_get(para_2, "para_font_size", 0) + para_2_font_color = safe_get(para_2, "para_font_color", "") + + if isinstance(para_1_font_type, list): # get the most common font type + para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count) + if isinstance(para_2_font_type, list): + para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count) 
def _is_para_puncs_consistent(self, para_1, para_2):
    """
    Decide from punctuation and layout whether para_2 (latter) continues
    para_1 (former), i.e. both fragments come from one original paragraph.

    Parameters
    ----------
    para_1 : dict
        The earlier paragraph fragment.
    para_2 : dict
        The candidate continuation.

    Returns
    -------
    bool
        True when the ending of para_1 and the beginning of para_2 look
        like a paragraph that was split across blocks or pages.
    """
    para_1_text = safe_get(para_1, "para_text", "").strip()
    para_2_text = safe_get(para_2, "para_text", "").strip()

    para_1_bboxes = safe_get(para_1, "para_bbox", [])
    para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
    para_2_bboxes = safe_get(para_2, "para_bbox", [])
    para_2_font_sizes = safe_get(para_2, "para_font_size", 0)

    if is_nested_list(para_1_bboxes):
        # Multi-line paragraph: judge by its LAST line.
        x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
        # Bug fix: mirror the para_2 handling below.  When the bboxes are
        # nested, the font sizes are a list as well, and the threshold
        # arithmetic further down would raise TypeError on a list.
        para_1_font_sizes = para_1_font_sizes[-1]  # type: ignore
    else:
        x0_1, y0_1, x1_1, y1_1 = para_1_bboxes

    if is_nested_list(para_2_bboxes):
        # Multi-line paragraph: judge by its FIRST line.
        x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
        para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
    else:
        x0_2, y0_2, x1_2, y1_2 = para_2_bboxes

    # Layout cues, scaled by the mean font size of the two fragments.
    align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
    are_two_paras_right_aligned = abs(x1_1 - x1_2) < align_threshold
    is_para1_left_indent_than_para2 = x0_1 - x0_2 > align_threshold
    is_para2_left_indent_than_para1 = x0_2 - x0_1 > align_threshold

    if not para_1_text or not para_2_text:
        return False

    end_puncs = [".", "?", "!", "。", "?", "!", "…"]
    hyphens = ["-", "—"]

    last_char = para_1_text[-1]
    first_char = para_2_text[0]

    if last_char in hyphens:
        # Hyphenated line break: the continuation should carry on the word.
        return bool(
            first_char in hyphens
            or (self._is_alphabet_char(first_char) and first_char.islower())
            or self._is_chinese_char(first_char)
            or self._is_other_letter_char(first_char)
        )

    if last_char in end_puncs:
        # Sentence finished: the continuation must look like a new sentence
        # of the same paragraph and must not be left-indented further.
        return bool(
            (
                first_char == " "
                or (self._is_alphabet_char(first_char) and first_char.isupper())
                or self._is_chinese_char(first_char)
                or self._is_other_letter_char(first_char)
            )
            and not is_para2_left_indent_than_para1
        )

    # para_1 does not end with closing punctuation: likely an unfinished
    # sentence spilling into para_2.  Note: the original had a trailing
    # "para_1 ends with space" branch; it was unreachable because the text
    # is stripped above and this branch already matches every remaining
    # ending, so it has been removed.  The redundant lowercase-letter test
    # was folded into the plain letter test.
    return bool(
        first_char == " "
        or self._is_alphabet_char(first_char)
        or self._is_year(para_2_text[0:4])
        or are_two_paras_right_aligned
        or is_para1_left_indent_than_para2
        or self._is_chinese_char(first_char)
        or self._is_other_letter_char(first_char)
    )


def _is_block_consistent(self, block1, block2):
    """
    Return True when block1 and block2 appear to come from one original
    block, judged purely by block-level font similarity.
    """
    return self.__is_same_block_font(block1, block2)
def _is_para_continued(self, para1, para2):
    """
    Determine whether para2 continues para1 (same original paragraph).

    Both the font-consistency check and the punctuation/layout check
    must agree.
    """
    font_ok = self.__is_para_font_consistent(para1, para2)
    puncs_ok = self._is_para_puncs_consistent(para1, para2)
    return font_ok and puncs_ok


def _are_boundaries_of_block_consistent(self, block1, block2):
    """
    Check font consistency across the boundary between two blocks: the
    first span of block1's LAST line against the first span of block2's
    FIRST line.

    Font type must be similar, sizes must differ by less than 1, and the
    style flags must match exactly.  Color is read but intentionally not
    compared (kept from the original logic).
    """
    tail_span = block1["lines"][-1]["spans"][0]
    head_span = block2["lines"][0]["spans"][0]

    tail_font = tail_span["font"].lower()
    tail_size = tail_span["size"]
    tail_color = tail_span["color"]  # read, not compared
    tail_flags = tail_span["flags"]

    head_font = head_span["font"].lower()
    head_size = head_span["size"]
    head_color = head_span["color"]  # read, not compared
    head_flags = head_span["flags"]

    return (
        self.__is_similar_font_type(tail_font, head_font)
        and abs(tail_size - head_size) < 1
        and tail_flags == head_flags
    )
def _get_last_paragraph(self, block):
    """Return the last paragraph dict of *block*, or None if it has none."""
    paras = block["paras"]
    if not paras:
        return None
    return paras[list(paras.keys())[-1]]


def _get_first_paragraph(self, block):
    """Return the first paragraph dict of *block*, or None if it has none."""
    paras = block["paras"]
    if not paras:
        return None
    return paras[list(paras.keys())[0]]


def should_merge_next_para(self, curr_para, next_para):
    """Return True when next_para should be merged into curr_para."""
    return self._is_para_continued(curr_para, next_para)


def batch_tag_paras(self, pdf_dict):
    """
    Tag every paragraph with its own location and, where a paragraph is
    judged to continue in the following block (possibly on a later page),
    with the location of that continuation.

    Adds to each paragraph dict:
      - "curr_para_location": [page_index, block_id, para_index]
      - "next_para_location": the same triple for the continuation, or None
      - "merge_next_para":    True when the continuation should be merged

    NOTE(review): the page index is the enumeration index over ALL keys of
    pdf_dict, and page keys are assumed to be exactly f"page_{index}" —
    confirm this still holds when non-page keys are present in the dict.
    """
    last_page_index = len(pdf_dict) - 1

    for page_index, (page_id, page_content) in enumerate(pdf_dict.items()):
        if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
            continue

        blocks = page_content["para_blocks"]
        lookahead_idx = page_index + 1
        lookahead_content = pdf_dict.get(f"page_{lookahead_idx}", {})

        for block_pos, block in enumerate(blocks):
            # Tag every paragraph of this block with its own location and
            # the continuation defaults.
            for para_id, para in block["paras"].items():
                para["curr_para_location"] = [
                    page_index,
                    block["block_id"],
                    int(para_id.split("_")[-1]),
                ]
                para["next_para_location"] = None  # default: no continuation
                para["merge_next_para"] = False

            # The candidate continuation of this block's LAST paragraph is
            # the FIRST paragraph of the next block: the following block on
            # this page, or the first block of the next non-empty page.
            last_para_key = list(block["paras"].keys())[-1]
            last_para = block["paras"][last_para_key]

            if block_pos < len(blocks) - 1:
                next_block = blocks[block_pos + 1]
                first_key = list(next_block["paras"].keys())[0]
                first_para = next_block["paras"][first_key]
                if self.should_merge_next_para(last_para, first_para):
                    last_para["next_para_location"] = [
                        page_index,
                        next_block["block_id"],
                        int(first_key.split("_")[-1]),
                    ]
                    last_para["merge_next_para"] = True
            else:
                # Skip empty pages until one with para_blocks is found.
                while not lookahead_content.get("para_blocks", []) and lookahead_idx <= last_page_index:
                    lookahead_idx += 1
                    lookahead_content = pdf_dict.get(f"page_{lookahead_idx}", {})

                if lookahead_content.get("para_blocks", []):
                    next_block = lookahead_content["para_blocks"][0]
                    first_key = list(next_block["paras"].keys())[0]
                    first_para = next_block["paras"][first_key]
                    if self.should_merge_next_para(last_para, first_para):
                        last_para["next_para_location"] = [
                            lookahead_idx,
                            next_block["block_id"],
                            int(first_key.split("_")[-1]),
                        ]
                        last_para["merge_next_para"] = True

    return pdf_dict


def find_block_by_id(self, para_blocks, block_id):
    """Return the block whose "block_id" equals *block_id*, or None."""
    return next((b for b in para_blocks if b.get("block_id") == block_id), None)


def batch_merge_paras(self, pdf_dict):
    """
    Follow the chains built by batch_tag_paras and concatenate each
    continuation paragraph into its first fragment.

    Merged-away fragments keep their dict entry but their "para_text" is
    set to "".  Title paragraphs are never merged, in either role.
    """
    for page_id, page_content in pdf_dict.items():
        if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
            continue

        for block in page_content["para_blocks"]:
            for para_id, para in list(block["paras"].items()):
                if para.get("is_para_title"):
                    continue  # titles are left untouched

                # Follow the continuation chain while it is flagged.
                while para.get("merge_next_para"):
                    location = para.get("next_para_location")
                    if not location:
                        break

                    next_page_idx, next_block_id, next_para_idx = location
                    target_page = pdf_dict.get(f"page_{next_page_idx}")
                    if not target_page:
                        break

                    target_block = self.find_block_by_id(
                        target_page.get("para_blocks", []), next_block_id
                    )
                    if not target_block:
                        break

                    next_para = target_block["paras"].get(f"para_{next_para_idx}")
                    if not next_para or next_para.get("is_para_title"):
                        break

                    # Concatenate with a single space, inherit the chain
                    # tail, and blank out the merged fragment.
                    para["para_text"] = para.get("para_text", "") + " " + next_para.get("para_text", "")
                    para["next_para_location"] = next_para.get("next_para_location")
                    next_para["para_text"] = ""
                    para["merge_next_para"] = next_para.get("merge_next_para", False)

    return pdf_dict
def _is_consistent_lines(self, curr_line, prev_line, next_line, consistent_direction):
    """
    Check whether curr_line's font (size and type of its first span)
    matches its neighbour(s).

    consistent_direction: 0 = compare with prev_line, 1 = with next_line,
    2 = with both.  Returns False when the requested neighbour is missing
    or the direction code is unknown.

    NOTE(review): font sizes are compared with exact float equality here,
    while the rest of the pipeline uses a tolerance — confirm intended.
    """
    def _same_font(other_line):
        return (
            curr_line["spans"][0]["size"] == other_line["spans"][0]["size"]
            and curr_line["spans"][0]["font"].lower() == other_line["spans"][0]["font"].lower()
        )

    if consistent_direction == 0:
        return bool(prev_line) and _same_font(prev_line)
    if consistent_direction == 1:
        return bool(next_line) and _same_font(next_line)
    if consistent_direction == 2:
        return bool(prev_line and next_line) and _same_font(prev_line) and _same_font(next_line)
    return False


def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
    """
    Heuristic: does this line look like a "regular" body line rather than
    a paragraph boundary?

    True when there is sufficient vertical spacing to a neighbour, or the
    line hugs neither page margin, or the previous line ends well short of
    the right margin (i.e. looks like a paragraph end).

    NOTE(review): the vertical gaps are computed as y1 - prev_bbox[3] and
    next_bbox[1] - y0, which mixes top/bottom edges — confirm the intended
    coordinate convention before relying on these spacings.
    """
    horizontal_thres = 0.5 * avg_char_width
    vertical_thres = 0.5 * avg_line_height

    x0, y0, x1, y1 = curr_line_bbox

    hugs_left = abs(x0 - X0) < horizontal_thres
    hugs_right = abs(x1 - X1) < horizontal_thres

    prev_ends_short = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)

    gap_above_ok = bool(prev_line_bbox) and (y1 - prev_line_bbox[3] > vertical_thres)
    gap_below_ok = bool(next_line_bbox) and (next_line_bbox[1] - y0 > vertical_thres)

    return (
        (gap_above_ok or gap_below_ok)
        or (not hugs_left and not hugs_right)
        or prev_ends_short
    )


def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
    """
    Score the likelihood that curr_line starts a paragraph.

    Returns (is_start, confidence, decision_path): confidence starts at
    0.5 and is increased for each matched cue; decision_path lists the
    names of the matched cues.
    """
    confidence = 0.5
    path = []

    curr_bbox = curr_line["bbox"]
    prev_bbox = prev_line["bbox"] if prev_line else None
    next_bbox = next_line["bbox"] if next_line else None

    vertical_thres = 1.5 * avg_font_size  # spacing that suggests a break
    left_thres = 0.5 * avg_char_width     # closeness to the left margin
    right_thres = 2.5 * avg_char_width    # closeness to the right margin

    x0, y0, x1, y1 = curr_bbox

    indented = x0 > X0 + 1 * avg_char_width
    if indented:
        confidence += 0.2
        path.append("indent_condition_met")

    near_left = abs(x0 - X0) < left_thres
    if near_left:
        confidence += 0.1
        path.append("x0_near_X0")

    near_right = abs(x1 - X1) < right_thres
    if near_right:
        confidence += 0.1
        path.append("x1_near_X1")

    if prev_line is None:
        prev_is_end = True
        confidence += 0.2
        path.append("no_prev_line")
    else:
        # NOTE(review): next_line is passed as prev_line's neighbour here;
        # curr_line looks like the intended argument — confirm upstream.
        prev_is_end, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
        if prev_is_end:
            confidence += 0.1
            path.append("prev_line_is_end_of_para")

    gap_above = False
    if prev_bbox:
        gap_above = (y1 - prev_bbox[3]) > vertical_thres
        if gap_above:
            confidence += 0.2
            path.append("sufficient_spacing_above")

    gap_below = False
    if next_bbox:
        gap_below = (next_bbox[1] - y0) > vertical_thres
        if gap_below:
            confidence += 0.2
            path.append("sufficient_spacing_below")

    regular = self._is_regular_line(
        curr_bbox, prev_bbox, next_bbox, avg_char_width, X0, X1, avg_font_size
    )
    if regular:
        confidence += 0.1
        path.append("is_regular_line")

    is_start = (
        (gap_above or gap_below)
        or indented
        or (not indented and near_left and near_right and not regular)
        or prev_is_end
    )
    return (is_start, confidence, path)


def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
    """
    Score the likelihood that curr_line ends a paragraph.

    Returns (is_end, confidence, decision_path).  The final decision
    requires the line text to end with sentence punctuation AND a layout
    cue: a short line near the left margin, or left-aligned but not
    right-aligned with respect to the next line.
    """
    confidence = 0.5
    path = []

    curr_bbox = curr_line["bbox"]
    next_bbox = next_line["bbox"] if next_line else None

    x0, _, x1, y1 = curr_bbox
    next_x0, next_y0, _, _ = next_bbox if next_bbox else (0, 0, 0, 0)

    near_left = abs(x0 - X0) < 0.5 * avg_char_width
    if near_left:
        confidence += 0.1
        path.append("x0_near_X0")

    ends_short = x1 < X1 - 0.5 * avg_char_width
    if ends_short:
        confidence += 0.1
        path.append("x1_smaller_than_X1")

    next_starts_para = (
        next_bbox
        and (next_x0 > X0 + 0.5 * avg_char_width)
        and (not is_line_left_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width, direction=1))
    )
    if next_starts_para:
        confidence += 0.2
        path.append("next_line_is_start_of_para")

    left_aligned = is_line_left_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width)
    if left_aligned:
        confidence += 0.1
        path.append("line_is_left_aligned_from_neighbors")

    right_aligned = is_line_right_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width)
    if not right_aligned:
        confidence += 0.1
        path.append("line_is_not_right_aligned_from_neighbors")

    is_end = end_with_punctuation(curr_line["text"]) and (
        (near_left and ends_short)
        or (left_aligned and not right_aligned)
    )
    return (is_end, confidence, path)
def _cut_paras_per_block(self, block):
    """
    Split one preprocessed block into paragraphs.

    Uses the start/end heuristics to find paragraph line ranges, then
    builds a "paras" dict ("para_0", "para_1", ...) on the block; each
    entry carries the paragraph bbox, joined text, dominant font
    type/size/color and the block's title flags.

    Returns the same block dict, mutated in place.
    """

    def _build_para(lines, is_title, title_level):
        """Aggregate span statistics of *lines* into one paragraph dict."""
        sizes = [span["size"] for line in lines for span in line["spans"]]
        avg_size = sum(sizes) / len(sizes) if sizes else 0

        colors = [span["color"] for line in lines for span in line["spans"]]
        main_color = max(set(colors), key=colors.count) if colors else None

        # Dominant font type = the one covering the largest total bbox width.
        width_per_font = {}
        for line in lines:
            for span in line["spans"]:
                width = span["bbox"][2] - span["bbox"][0]
                width_per_font[span["font"]] = width_per_font.get(span["font"], 0) + width
        main_font = max(width_per_font, key=width_per_font.get) if width_per_font else None  # type: ignore

        return {
            "para_bbox": calculate_para_bbox(lines),
            "para_text": " ".join(line["text"] for line in lines),
            "para_font_type": main_font,
            "para_font_size": avg_size,
            "para_font_color": main_color,
            "is_para_title": is_title,
            "para_title_level": title_level,
        }

    block_bbox = block["bbox"]
    block_text = block["text"]
    block_lines = block["lines"]

    X0 = safe_get(block, "X0", 0)
    X1 = safe_get(block, "X1", 0)
    avg_char_width = safe_get(block, "avg_char_width", 0)
    avg_char_height = safe_get(block, "avg_char_height", 0)
    avg_font_size = safe_get(block, "avg_font_size", 0)

    is_block_title = safe_get(block, "is_block_title", False)
    title_level = safe_get(block, "block_title_level", 0)

    # --- locate paragraph line ranges -------------------------------------
    para_ranges = []
    in_para = False
    start_idx = None

    for idx, line in enumerate(block_lines):
        prev_line = block_lines[idx - 1] if idx > 0 else None
        next_line = block_lines[idx + 1] if idx < len(block_lines) - 1 else None

        starts, _start_conf, _ = self._is_possible_start_of_para(
            line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
        )
        if not in_para and starts:
            in_para = True
            start_idx = idx

        ends, _end_conf, _ = self._is_possible_end_of_para(
            line, next_line, X0, X1, avg_char_width
        )
        if in_para and (ends or not next_line):
            para_ranges.append((start_idx, idx))
            start_idx = None
            in_para = False

    # Close a paragraph that was left open at the end of the block.
    if in_para and start_idx is not None:
        para_ranges.append((start_idx, len(block_lines) - 1))

    # --- materialise the paragraphs ---------------------------------------
    paras = {}
    consumed_up_to = 0
    for first, last in para_ranges:
        para = _build_para(block_lines[first : last + 1], is_block_title, title_level)
        paras[f"para_{len(paras)}"] = para
        consumed_up_to = last + 1

    # Any trailing lines not covered by a detected range become one more para.
    if consumed_up_to < len(block_lines):
        para = _build_para(block_lines[consumed_up_to:], is_block_title, title_level)
        paras[f"para_{len(paras)}"] = para

    block["paras"] = paras
    return block


def batch_process_blocks(self, pdf_dict):
    """
    Run paragraph cutting over every page's para_blocks and record the
    total paragraph count under pdf_dict["statistics"]["num_paras"].

    Returns the mutated pdf_dict.
    """
    num_paras = 0

    for page_id, page in pdf_dict.items():
        if not page_id.startswith("page_"):
            continue
        processed = []
        if "para_blocks" in page.keys():
            for raw_block in page["para_blocks"]:
                cut_block = self._cut_paras_per_block(raw_block)
                processed.append(cut_block)
                num_paras += len(cut_block["paras"])
        # Note: also (re)set to an empty list on pages that had no
        # para_blocks, matching the original behaviour.
        page["para_blocks"] = processed

    pdf_dict["statistics"]["num_paras"] = num_paras
    return pdf_dict
"on_red", attrs=["bold"], end="\n\n") + + +def print_green(text): + print() + cprint(text, "green", attrs=["bold"], end="\n\n") + + +def print_red(text): + print() + cprint(text, "red", attrs=["bold"], end="\n\n") + + +def print_yellow(text): + print() + cprint(text, "yellow", attrs=["bold"], end="\n\n") + + +def safe_get(dict_obj, key, default): + val = dict_obj.get(key) + if val is None: + return default + else: + return val + + +def is_bbox_overlap(bbox1, bbox2): + """ + This function checks if bbox1 and bbox2 overlap or not + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 and bbox2 overlap, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 > x1_2 or x0_2 > x1_1: + return False + if y0_1 > y1_2 or y0_2 > y1_1: + return False + + return True + + +def is_in_bbox(bbox1, bbox2): + """ + This function checks if bbox1 is in bbox2 + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 is in bbox2, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2: + return True + else: + return False + + +def calculate_para_bbox(lines): + """ + This function calculates the minimum bbox of the paragraph + + Parameters + ---------- + lines : list + lines + + Returns + ------- + para_bbox : list + bbox of the paragraph + """ + x0 = min(line["bbox"][0] for line in lines) + y0 = min(line["bbox"][1] for line in lines) + x1 = max(line["bbox"][2] for line in lines) + y1 = max(line["bbox"][3] for line in lines) + return [x0, y0, x1, y1] + + +def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is right aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + 
prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is right aligned from its neighbors, False otherwise. + """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + _, _, x1, _ = curr_line_bbox + _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x1 - prev_x1) < horizontal_thres + elif direction == 1: + return abs(x1 - next_x1) < horizontal_thres + elif direction == 2: + return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres + else: + return False + + +def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is left aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is left aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, _, _ = curr_line_bbox + prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x0 - prev_x0) < horizontal_thres + elif direction == 1: + return abs(x0 - next_x0) < horizontal_thres + elif direction == 2: + return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres + else: + return False + + +def end_with_punctuation(line_text): + """ + This function checks if the line ends with punctuation marks + """ + + english_end_puncs = [".", "?", "!"] + chinese_end_puncs = ["。", "?", "!"] + end_puncs = english_end_puncs + chinese_end_puncs + + last_non_space_char = None + for ch in line_text[::-1]: + if not ch.isspace(): + last_non_space_char = ch + break + + if last_non_space_char is None: + return False + + return last_non_space_char in end_puncs + + +def is_nested_list(lst): + if isinstance(lst, list): + return any(isinstance(sub, list) for sub in lst) + return False diff --git a/magic_pdf/para/denoise.py b/magic_pdf/para/denoise.py new file mode 100644 index 0000000000000000000000000000000000000000..2d49f3834e25b6b9d7c07203810ec9fe8a6618a5 --- /dev/null +++ b/magic_pdf/para/denoise.py @@ -0,0 +1,246 @@ +import math + +from collections import defaultdict +from magic_pdf.para.commons import * + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class HeaderFooterProcessor: + def __init__(self) -> None: + pass + + def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2): + """ + This function gets the most common bboxes from the bboxes + + Parameters + ---------- + bboxes : list + bboxes + page_height : float + height of the page + position : str, optional + "top" or "bottom", by default "top" + threshold : float, optional + 
threshold, by default 0.25 + num_bboxes : int, optional + number of bboxes to return, by default 3 + min_frequency : int, optional + minimum frequency of the bbox, by default 2 + + Returns + ------- + common_bboxes : list + common bboxes + """ + # Filter bbox by position + if position == "top": + filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold] + else: + filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)] + + # Find the most common bbox + bbox_count = defaultdict(int) + for bbox in filtered_bboxes: + bbox_count[tuple(bbox)] += 1 + + # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency + common_bboxes = [ + bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency + ][:num_bboxes] + return common_bboxes + + def detect_footer_header(self, result_dict, similarity_threshold=0.5): + """ + This function detects the header and footer of the document. 
+ + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + result_dict : dict + result dictionary + """ + + def compare_bbox_with_list(bbox, bbox_list, tolerance=1): + return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list) + + def is_single_line_block(block): + # Determine based on the width and height of the block + block_width = block["X1"] - block["X0"] + block_height = block["bbox"][3] - block["bbox"][1] + + # If the height of the block is close to the average character height and the width is large, it is considered a single line + return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3 + + # Traverse all blocks in the document + single_preproc_blocks = 0 + total_blocks = 0 + single_preproc_blocks = 0 + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + total_blocks += 1 + if is_single_line_block(block): + single_preproc_blocks += 1 + + # If there are no blocks, skip the header and footer detection + if total_blocks == 0: + print("No blocks found. 
Skipping header/footer detection.") + return result_dict + + # If most of the blocks are single-line, skip the header and footer detection + if single_preproc_blocks / total_blocks > 0.5: # 50% of the blocks are single-line + return result_dict + + # Collect the bounding boxes of all blocks + all_bboxes = [] + all_texts = [] + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + all_bboxes.append(block["bbox"]) + + # Get the height of the page + page_height = max(bbox[3] for bbox in all_bboxes) + + # Get the most common bbox lists for headers and footers + common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else [] + common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else [] + + # Detect and mark headers and footers + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + bbox = block["bbox"] + text = block["text"] + + is_header = compare_bbox_with_list(bbox, common_header_bboxes) + is_footer = compare_bbox_with_list(bbox, common_footer_bboxes) + + block["is_header"] = int(is_header) + block["is_footer"] = int(is_footer) + + return result_dict + + +class NonHorizontalTextProcessor: + def __init__(self) -> None: + pass + + def detect_non_horizontal_texts(self, result_dict): + """ + This function detects watermarks and vertical margin notes in the document. + + Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages. + If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page. + If the direction of these blocks is not horizontal, they are definitely considered to be watermarks. 
+ + Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages. + If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. + If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes. + + + Parameters + ---------- + result_dict : dict + The result dictionary. + + Returns + ------- + result_dict : dict + The updated result dictionary. + """ + # Dictionary to store information about potential watermarks + potential_watermarks = {} + potential_margin_notes = {} + + for page_id, page_content in result_dict.items(): + if page_id.startswith("page_"): + for block_id, block_data in page_content.items(): + if block_id.startswith("block_"): + if "dir" in block_data: + coordinates_text = (block_data["bbox"], block_data["text"]) # Tuple of coordinates and text + + angle = math.atan2(block_data["dir"][1], block_data["dir"][0]) + angle = abs(math.degrees(angle)) + + if angle > 5 and angle < 85: # Check if direction is watermarks + if coordinates_text in potential_watermarks: + potential_watermarks[coordinates_text] += 1 + else: + potential_watermarks[coordinates_text] = 1 + + if angle > 85 and angle < 105: # Check if direction is vertical + if coordinates_text in potential_margin_notes: + potential_margin_notes[coordinates_text] += 1 # Increment count + else: + potential_margin_notes[coordinates_text] = 1 # Initialize count + + # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages) + watermark_threshold = len(result_dict) // 2 + watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold} + + # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages) + margin_note_threshold = len(result_dict) 
// 2 + margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold} + + # Add watermark information to the result dictionary + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_id, block_data in blocks.items(): + coordinates_text = (block_data["bbox"], block_data["text"]) + if coordinates_text in watermarks: + block_data["is_watermark"] = 1 + else: + block_data["is_watermark"] = 0 + + if coordinates_text in margin_notes: + block_data["is_vertical_margin_note"] = 1 + else: + block_data["is_vertical_margin_note"] = 0 + + return result_dict + + +class NoiseRemover: + def __init__(self) -> None: + pass + + def skip_data_noises(self, result_dict): + """ + This function skips the data noises, including overlap blocks, header, footer, watermark, vertical margin note, title + """ + filtered_result_dict = {} + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + filtered_blocks = {} + for block_id, block in blocks.items(): + if block_id.startswith("block_"): + if any( + block.get(key, 0) + for key in [ + "is_overlap", + "is_header", + "is_footer", + "is_watermark", + "is_vertical_margin_note", + "is_block_title", + ] + ): + continue + filtered_blocks[block_id] = block + if filtered_blocks: + filtered_result_dict[page_id] = filtered_blocks + + return filtered_result_dict diff --git a/magic_pdf/para/draw.py b/magic_pdf/para/draw.py new file mode 100644 index 0000000000000000000000000000000000000000..041a21bcbeb4522b43e3ea1b70cba0ae857323a6 --- /dev/null +++ b/magic_pdf/para/draw.py @@ -0,0 +1,121 @@ +from magic_pdf.libs.commons import fitz + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class DrawAnnos: + """ + This class draws annotations on the pdf file + + ---------------------------------------- + Color Code + ---------------------------------------- + Red: (1, 0, 0) + Green: (0, 1, 0) 
+ Blue: (0, 0, 1) + Yellow: (1, 1, 0) - mix of red and green + Cyan: (0, 1, 1) - mix of green and blue + Magenta: (1, 0, 1) - mix of red and blue + White: (1, 1, 1) - red, green and blue full intensity + Black: (0, 0, 0) - no color component whatsoever + Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components + Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component + """ + + def __init__(self) -> None: + pass + + def __is_nested_list(self, lst): + """ + This function returns True if the given list is a nested list of any degree. + """ + if isinstance(lst, list): + return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst) + return False + + def __valid_rect(self, bbox): + # Ensure that the rectangle is not empty or invalid + if isinstance(bbox[0], list): + return False # It's a nested list, hence it can't be valid rect + else: + return bbox[0] < bbox[2] and bbox[1] < bbox[3] + + def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)): + """ + This function draws the nested boxes + + Parameters + ---------- + page : fitz.Page + page + nested_bbox : list + nested bbox + color : tuple + color, by default (0, 1, 1) # draw with cyan color for combined paragraph + """ + if self.__is_nested_list(nested_bbox): # If it's a nested list + for bbox in nested_bbox: + self.__draw_nested_boxes(page, bbox, color) # Recursively call the function + elif self.__valid_rect(nested_bbox): # If valid rectangle + para_rect = fitz.Rect(nested_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph + para_anno.set_border(width=1) + para_anno.update() + + def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path): + pdf_doc = open_pdf(input_pdf_path) + + if pdf_dic is None: + pdf_dic = {} + + if output_pdf_path is None: + output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf") + + for 
page_id, page in enumerate(pdf_doc): # type: ignore + page_key = f"page_{page_id}" + for ele_key, ele_data in pdf_dic[page_key].items(): + if ele_key == "para_blocks": + para_blocks = ele_data + for para_block in para_blocks: + if "paras" in para_block.keys(): + paras = para_block["paras"] + for para_key, para_content in paras.items(): + para_bbox = para_content["para_bbox"] + # print(f"para_bbox: {para_bbox}") + # print(f"is a nested list: {self.__is_nested_list(para_bbox)}") + if self.__is_nested_list(para_bbox) and len(para_bbox) > 1: + color = (0, 1, 1) + self.__draw_nested_boxes( + page, para_bbox, color + ) # draw with cyan color for combined paragraph + else: + if self.__valid_rect(para_bbox): + para_rect = fitz.Rect(para_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph + para_anno.set_border(width=0.5) + para_anno.update() + + is_para_title = para_content["is_para_title"] + if is_para_title: + if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1: + color = (0, 0, 1) + self.__draw_nested_boxes( + page, para_content["para_bbox"], color + ) # draw with cyan color for combined title + else: + if self.__valid_rect(para_content["para_bbox"]): + para_rect = fitz.Rect(para_content["para_bbox"]) + if self.__valid_rect(para_content["para_bbox"]): + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title + para_anno.set_border(width=0.5) + para_anno.update() + + pdf_doc.save(output_pdf_path) + pdf_doc.close() diff --git a/magic_pdf/para/exceptions.py b/magic_pdf/para/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..75b19fac480ec1f1fa914f06e85228a67b3a9d7a --- /dev/null +++ b/magic_pdf/para/exceptions.py @@ -0,0 +1,198 @@ +class DenseSingleLineBlockException(Exception): + """ + This class defines the exception type for dense single line-block. 
+ """ + + def __init__(self, message="DenseSingleLineBlockException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleDetectionException(Exception): + """ + This class defines the exception type for title detection. + """ + + def __init__(self, message="TitleDetectionException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleLevelException(Exception): + """ + This class defines the exception type for title level. + """ + + def __init__(self, message="TitleLevelException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaSplitException(Exception): + """ + This class defines the exception type for paragraph splitting. + """ + + def __init__(self, message="ParaSplitException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaMergeException(Exception): + """ + This class defines the exception type for paragraph merging. 
+ """ + + def __init__(self, message="ParaMergeException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class DiscardByException: + """ + This class discards pdf files by exception + """ + + def __init__(self) -> None: + pass + + def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException): + """ + This function discards pdf files by single line block exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + exception_page_nums = 0 + page_num = 0 + for page_id, page in pdf_dic.items(): + if page_id.startswith("page_"): + page_num += 1 + if "preproc_blocks" in page.keys(): + preproc_blocks = page["preproc_blocks"] + + all_single_line_blocks = [] + for block in preproc_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9: + exception_page_nums += 1 + + if page_num == 0: + return None + + if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded + return exception.message + + return None + + def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException): + """ + This function discards pdf files by title detection exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None + + def discard_by_title_level(self, pdf_dic, exception: TitleLevelException): + """ + This function discards pdf files by title level exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + 
return None + + def discard_by_split_para(self, pdf_dic, exception: ParaSplitException): + """ + This function discards pdf files by split para exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None + + def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException): + """ + This function discards pdf files by merge para exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None diff --git a/magic_pdf/para/layout_match_processor.py b/magic_pdf/para/layout_match_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..4f93f1a872e183b7f45c23244579dca97a92617a --- /dev/null +++ b/magic_pdf/para/layout_match_processor.py @@ -0,0 +1,40 @@ +import math +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class LayoutFilterProcessor: + def __init__(self) -> None: + pass + + def batch_process_blocks(self, pdf_dict): + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys(): + layout_bbox_objs = blocks["layout_bboxes"] + if layout_bbox_objs is None: + continue + layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs] + + # Use math.ceil function to enlarge each value of x0, y0, x1, y1 of each layout_bbox + layout_bboxes = [ + [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes + ] + + para_blocks = blocks["para_blocks"] + if para_blocks is None: + continue + + for lb_bbox in layout_bboxes: + for i, para_block in enumerate(para_blocks): + para_bbox = para_block["bbox"] + para_blocks[i]["in_layout"] = 0 + if 
is_in_bbox(para_bbox, lb_bbox): + para_blocks[i]["in_layout"] = 1 + + blocks["para_blocks"] = para_blocks + + return pdf_dict diff --git a/magic_pdf/para/para_pipeline.py b/magic_pdf/para/para_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..13911c1a7e10c37ceba568fcf8f8a379050e4f70 --- /dev/null +++ b/magic_pdf/para/para_pipeline.py @@ -0,0 +1,297 @@ +import os +import json + +from magic_pdf.para.commons import * + +from magic_pdf.para.raw_processor import RawBlockProcessor +from magic_pdf.para.layout_match_processor import LayoutFilterProcessor +from magic_pdf.para.stats import BlockStatisticsCalculator +from magic_pdf.para.stats import DocStatisticsCalculator +from magic_pdf.para.title_processor import TitleProcessor +from magic_pdf.para.block_termination_processor import BlockTerminationProcessor +from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor +from magic_pdf.para.draw import DrawAnnos +from magic_pdf.para.exceptions import ( + DenseSingleLineBlockException, + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DiscardByException, +) + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class ParaProcessPipeline: + def __init__(self) -> None: + pass + + def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None): + """ + This function processes the paragraphs, including: + 1. Read raw input json file into pdf_dic + 2. Detect and replace equations + 3. Combine spans into a natural line + 4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key + 5. Compute statistics for each block + 6. Detect titles in the document + 7. Detect paragraphs inside each block + 8. Divide the level of the titles + 9. Detect and combine paragraphs from different blocks into one paragraph + 10. 
Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable. + 11. Draw annotations on the pdf file + + Parameters + ---------- + pdf_dic_json_fpath : str + path to the pdf dictionary json file. + Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already. + input_pdf_doc : str + path to the input pdf file + output_pdf_path : str + path to the output pdf file + + Returns + ------- + pdf_dict : dict + result dictionary + """ + + error_info = None + + output_json_file = "" + output_dir = "" + + if input_pdf_path is not None: + input_pdf_path = os.path.abspath(input_pdf_path) + + # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}") + + if output_pdf_path is not None: + output_dir = os.path.dirname(output_pdf_path) + output_json_file = f"{output_dir}/pdf_dic.json" + + def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode): + """ + Save the pdf_dic to a json file + """ + output_pdf_file_name = os.path.basename(output_pdf_path) + # output_dir = os.path.dirname(output_pdf_path) + output_dir = "\\tmp\\pdf_parse" + output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json") + pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if para_debug_mode == "full": + with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f: + json.dump(pdf_dic, f, indent=2, ensure_ascii=False) + + # Validate the output already exists + if not os.path.exists(pdf_dic_json_fpath): + print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}") + return None + else: + print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}") + + return pdf_dic_json_fpath + + """ + Preprocess the lines of block + """ + # Find and replace the interline and inline equations, 
should be better done before the paragraph processing + # Create "para_blocks" for each page. + # equationProcessor = EquationsProcessor() + # pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict) + + # Combine spans into a natural line + rawBlockProcessor = RawBlockProcessor() + pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict) + # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n") + + # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key + layoutFilter = LayoutFilterProcessor() + pdf_dic = layoutFilter.batch_process_blocks(pdf_dic) + + # Compute statistics for each block + blockStatisticsCalculator = BlockStatisticsCalculator() + pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic) + # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n") + + # Compute statistics for all blocks(namely this pdf document) + docStatisticsCalculator = DocStatisticsCalculator() + pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic) + # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n") + + # Dump the first three stages of pdf_dic to a json file + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode) + + """ + Detect titles in the document + """ + doc_statistics = pdf_dic["statistics"] + titleProcessor = TitleProcessor(doc_statistics) + pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode) + + """ + Detect and divide the level of the titles + """ + titleProcessor = TitleProcessor() + + pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", 
para_debug_mode=para_debug_mode) + + """ + Detect and split paragraphs inside each block + """ + blockInnerParasProcessor = BlockTerminationProcessor() + + pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode) + + # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full") + # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}") + + """ + Detect and combine paragraphs from different blocks into one paragraph + """ + blockContinuationProcessor = BlockContinuationProcessor() + + pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic) + pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode) + + # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full") + # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}") + + """ + Discard pdf files by checking exceptions and return the error info to the caller + """ + discardByException = DiscardByException() + + is_discard_by_single_line_block = discardByException.discard_by_single_line_block( + pdf_dic, exception=DenseSingleLineBlockException() + ) + is_discard_by_title_detection = discardByException.discard_by_title_detection( + pdf_dic, exception=TitleDetectionException() + ) + is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException()) + is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException()) + is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException()) + + """ + if any( + info is not None + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + 
is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + ): + error_info = next( + ( + info + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + if info is not None + ), + None, + ) + return pdf_dic, error_info + + if any( + info is not None + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + ): + error_info = next( + ( + info + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + if info is not None + ), + None, + ) + return pdf_dic, error_info + """ + + """ + Dump the final pdf_dic to a json file + """ + if para_debug_mode is not None: + with open(output_json_file, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + + """ + Draw the annotations + """ + + if is_discard_by_single_line_block is not None: + error_info = is_discard_by_single_line_block + elif is_discard_by_title_detection is not None: + error_info = is_discard_by_title_detection + elif is_discard_by_title_level is not None: + error_info = is_discard_by_title_level + elif is_discard_by_split_para is not None: + error_info = is_discard_by_split_para + elif is_discard_by_merge_para is not None: + error_info = is_discard_by_merge_para + + if error_info is not None: + return pdf_dic, error_info + + """ + Dump the final pdf_dic to a json file + """ + if para_debug_mode is not None: + with open(output_json_file, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + + """ + Draw the annotations + """ + if para_debug_mode is not None: + drawAnnos = DrawAnnos() + drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path) + + """ + Remove the 
intermediate files which are generated in the process of paragraph processing if debug_mode is simple + """ + if para_debug_mode is not None: + for fpath in os.listdir(output_dir): + if fpath.endswith(".json") and "stage" in fpath: + os.remove(os.path.join(output_dir, fpath)) + + return pdf_dic, error_info diff --git a/magic_pdf/para/para_split.py b/magic_pdf/para/para_split.py new file mode 100644 index 0000000000000000000000000000000000000000..c9808abeda4f2130882eb7a40c611002fdb7e9dc --- /dev/null +++ b/magic_pdf/para/para_split.py @@ -0,0 +1,644 @@ +from sklearn.cluster import DBSCAN +import numpy as np +from loguru import logger + +from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout +from magic_pdf.libs.ocr_content_type import ContentType + + +LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"] +INLINE_EQUATION = ContentType.InlineEquation +INTERLINE_EQUATION = ContentType.InterlineEquation +TEXT = ContentType.Text + + +def __get_span_text(span): + c = span.get('content', '') + if len(c)==0: + c = span.get('image_path', '') + + return c + + +def __detect_list_lines(lines, new_layout_bboxes, lang): + """ + 探测是否包含了列表,并且把列表的行分开. 
+ 这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。 + """ + def find_repeating_patterns(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst) - 1: # 确保余下元素至少有2个 + if lst[i] == 1 and lst[i+1] in [2, 3]: # 额外检查以防止连续出现的1 + start = i + ones_in_this_interval = [i] + i += 1 + while i < len(lst) and lst[i] in [2, 3]: + i += 1 + # 验证下一个序列是否符合条件 + if i < len(lst) - 1 and lst[i] == 1 and lst[i+1] in [2, 3] and lst[i-1] in [2, 3]: + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + else: + i += 1 + return indices, ones_indices + """====================""" + def split_indices(slen, index_array): + result = [] + last_end = 0 + + for start, end in sorted(index_array): + if start > last_end: + # 前一个区间结束到下一个区间开始之间的部分标记为"text" + result.append(('text', last_end, start - 1)) + # 区间内标记为"list" + result.append(('list', start, end)) + last_end = end + 1 + + if last_end < slen: + # 如果最后一个区间结束后还有剩余的字符串,将其标记为"text" + result.append(('text', last_end, slen - 1)) + + return result + """====================""" + + if lang!='en': + return lines, None + else: + total_lines = len(lines) + line_fea_encode = [] + """ + 对每一行进行特征编码,编码规则如下: + 1. 如果行顶格,且大写字母开头或者数字开头,编码为1 + 2. 如果顶格,其他非大写开头编码为4 + 3. 如果非顶格,首字符大写,编码为2 + 4. 
如果非顶格,首字符非大写编码为3 + """ + for l in lines: + first_char = __get_span_text(l['spans'][0])[0] + layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0] + if l['bbox'][0] == layout_left: + if first_char.isupper() or first_char.isdigit(): + line_fea_encode.append(1) + else: + line_fea_encode.append(4) + else: + if first_char.isupper(): + line_fea_encode.append(2) + else: + line_fea_encode.append(3) + + # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 + + list_indice, list_start_idx = find_repeating_patterns(line_fea_encode) + if len(list_indice)>0: + logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}") + + # TODO check一下这个特列表里缩进的行左侧是不是对齐的。 + segments = [] + for start, end in list_indice: + for i in range(start, end+1): + if i>0: + if line_fea_encode[i] == 4: + logger.info(f"列表行的第{i}行不是顶格的") + break + else: + logger.info(f"列表行的第{start}到第{end}行是列表") + + return split_indices(total_lines, list_indice), list_start_idx + + + +def __valign_lines(blocks, layout_bboxes): + """ + 在一个layoutbox内对齐行的左侧和右侧。 + 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。 + 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。 + """ + + min_distance = 3 + min_sample = 2 + new_layout_bboxes = [] + + for layout_box in layout_bboxes: + blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])] + if len(blocks_in_layoutbox)==0: + continue + + x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + x1_uniq_label = np.unique(x1_clusters.labels_) + + x0_2_new_val = {} # 存储旧值对应的新值映射 + x1_2_new_val = {} + for label in x0_uniq_label: + if label==-1: + continue + x0_index_of_label = 
np.where(x0_clusters.labels_==label) + x0_raw_val = x0_lst[x0_index_of_label][:,0] + x0_new_val = np.min(x0_lst[x0_index_of_label][:,0]) + x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val}) + for label in x1_uniq_label: + if label==-1: + continue + x1_index_of_label = np.where(x1_clusters.labels_==label) + x1_raw_val = x1_lst[x1_index_of_label][:,0] + x1_new_val = np.max(x1_lst[x1_index_of_label][:,0]) + x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val}) + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 in x0_2_new_val: + line['bbox'][0] = int(x0_2_new_val[x0]) + + if x1 in x1_2_new_val: + line['bbox'][2] = int(x1_2_new_val[x1]) + # 其余对不齐的保持不动 + + # 由于修改了block里的line长度,现在需要重新计算block的bbox + for block in blocks_in_layoutbox: + block['bbox'] = [min([line['bbox'][0] for line in block['lines']]), + min([line['bbox'][1] for line in block['lines']]), + max([line['bbox'][2] for line in block['lines']]), + max([line['bbox'][3] for line in block['lines']])] + + """新计算layout的bbox,因为block的bbox变了。""" + layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox]) + layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox]) + layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox]) + layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox]) + new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1]) + + return new_layout_bboxes + + +def __align_text_in_layout(blocks, layout_bboxes): + """ + 由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。 + """ + for layout in layout_bboxes: + lb = layout['layout_bbox'] + blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], lb)] + if len(blocks_in_layoutbox)==0: + continue + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 < lb[0]: + line['bbox'][0] = lb[0] + if x1 > lb[2]: + line['bbox'][2] = lb[2] + + +def 
__common_pre_proc(blocks, layout_bboxes): + """ + 不分语言的,对文本进行预处理 + """ + #__add_line_period(blocks, layout_bboxes) + __align_text_in_layout(blocks, layout_bboxes) + aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) + + return aligned_layout_bboxes + +def __pre_proc_zh_blocks(blocks, layout_bboxes): + """ + 对中文文本进行分段预处理 + """ + pass + + +def __pre_proc_en_blocks(blocks, layout_bboxes): + """ + 对英文文本进行分段预处理 + """ + pass + + +def __group_line_by_layout(blocks, layout_bboxes, lang="en"): + """ + 每个layout内的行进行聚合 + """ + # 因为只是一个block一行目前, 一个block就是一个段落 + lines_group = [] + + for lyout in layout_bboxes: + lines = [line for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox']) for line in block['lines']] + lines_group.append(lines) + + return lines_group + + +def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_len=10): + """ + lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 + 1. 先计算每个group的左右边界。 + 2. 然后根据行末尾特征进行分段。 + 末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。 + 且下一行开头不留空白。 + + """ + list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + layout_paras = [] + right_tail_distance = 1.5 * char_avg_len + + for lines in lines_group: + paras = [] + total_lines = len(lines) + if total_lines==0: + continue # 0行无需处理 + if total_lines==1: # 1行无法分段。 + layout_paras.append([lines]) + list_info.append([False, False]) + continue + + """在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测, + 对齐方式分为以下: + 1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写) + 1) 右侧对齐的行,单独成一段 + 2) 中间对齐的行,按照字体/行高聚合成一段 + 2. 
左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表) + 这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。 + """ + + text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang) + """根据list_range,把lines分成几个部分 + + """ + + layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2] + layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0] + para = [] # 元素是line + layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for content_type, start, end in text_segments: + if content_type == 'list': + for i, line in enumerate(lines[start:end+1]): + line_x0 = line['bbox'][0] + if line_x0 == layout_left: # 列表开头 + if len(para)>0: + paras.append(para) + para = [] + para.append(line) + else: + para.append(line) + if len(para)>0: + paras.append(para) + para = [] + if start==0: + layout_list_info[0] = True + if end==total_lines-1: + layout_list_info[1] = True + else: # 是普通文本 + for i, line in enumerate(lines[start:end+1]): + # 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断i行自己的结尾特征。 + cur_line_type = line['spans'][-1]['type'] + next_line = lines[i+1] if i= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_left: # 现在这行到了行尾沾满,下一行存在且顶格。 + para.append(line) + else: + para.append(line) + paras.append(para) + para = [] + else: # 其他,图片、表格、行间公式,各自占一段 + if len(para)>0: # 先把之前的段落加入到结果中 + paras.append(para) + para = [] + paras.append([line]) # 再把当前行加入到结果中。当前行为行间公式、图、表等。 + para = [] + + if len(para)>0: + paras.append(para) + para = [] + + list_info.append(layout_list_info) + layout_paras.append(paras) + paras = [] + + + return layout_paras, list_info + +def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(layout_paras)==0 or 
len(layout_list_info)==0: # 0的时候最后的return 会出错 + return layout_paras, [False, False] + + for i in range(1, len(layout_paras)): + pre_layout_list_info = layout_list_info[i-1] + next_layout_list_info = layout_list_info[i] + pre_last_para = layout_paras[i-1][-1] + next_paras = layout_paras[i] + next_first_para = next_paras[0] + + if pre_layout_list_info[1] and not next_layout_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + for j in range(len(next_paras)): + line = next_paras[j] + if len(line)==1: # 只可能是一行,多行情况再需要分析了 + if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], new_layout_bbox)[0]: + may_list_lines.append(line[0]) + else: + break + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1: + pre_last_para.extend(may_list_lines) + layout_paras[i] = layout_paras[i][len(may_list_lines):] + + return layout_paras, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息 + + +def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, pre_page_list_info, next_page_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(pre_page_paras)==0 or len(next_page_paras)==0: # 0的时候最后的return 会出错 + return False + + if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + for j in range(len(next_page_paras[0])): + line = next_page_paras[0][j] + if len(line)==1: # 只可能是一行,多行情况再需要分析了 + if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], next_page_layout_bbox)[0]: + may_list_lines.append(line[0]) + else: + break + else: + 
break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1: + pre_page_paras[-1].append(may_list_lines) + next_page_paras[0] = next_page_paras[0][len(may_list_lines):] + return True + + return False + + +def __find_layout_bbox_by_line(line_bbox, layout_bboxes): + """ + 根据line找到所在的layout + """ + for layout in layout_bboxes: + if is_in_layout(line_bbox, layout): + return layout + return None + + +def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang): + """ + layout之间进行分段。 + 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 + 连接的条件需要同时满足: + 1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。 + 2. 下一行开头不留空白。 + + """ + connected_layout_paras = [] + if len(layout_paras)==0: + return connected_layout_paras + + connected_layout_paras.append(layout_paras[0]) + for i in range(1, len(layout_paras)): + try: + if len(layout_paras[i])==0 or len(layout_paras[i-1])==0: # TODO 考虑连接问题, + continue + pre_last_line = layout_paras[i-1][-1][-1] + next_first_line = layout_paras[i][0][0] + except Exception as e: + logger.error(f"page layout {i} has no line") + continue + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: + connected_layout_paras.append(layout_paras[i]) + continue + + pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[2] + next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and 
next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + connected_layout_paras[-1][-1].extend(layout_paras[i][0]) + layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。 + if len(layout_paras[i])==0: + layout_paras.pop(i) + else: + connected_layout_paras.append(layout_paras[i]) + else: + """连接段落条件不成立,将前一个layout的段落加入到结果中。""" + connected_layout_paras.append(layout_paras[i]) + + return connected_layout_paras + + +def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang): + """ + 连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。 + 是否可以连接的条件: + 1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。 + 2. 后一个页面的第一个段落第一行没有空白开头。 + """ + # 有的页面可能压根没有文字 + if len(pre_page_paras)==0 or len(next_page_paras)==0 or len(pre_page_paras[0])==0 or len(next_page_paras[0])==0: # TODO [[]]为什么出现在pre_page_paras里? + return False + pre_last_para = pre_page_paras[-1][-1] + next_first_para = next_page_paras[0][0] + pre_last_line = pre_last_para[-1] + next_first_line = next_first_para[0] + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况 + # 不是文本,不连接 + return False + + pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2] + next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and 
next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + pre_last_para.extend(next_first_para) + next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。 + return True + else: + return False + +def find_consecutive_true_regions(input_array): + start_index = None # 连续True区域的起始索引 + regions = [] # 用于保存所有连续True区域的起始和结束索引 + + for i in range(len(input_array)): + # 如果我们找到了一个True值,并且当前并没有在连续True区域中 + if input_array[i] and start_index is None: + start_index = i # 记录连续True区域的起始索引 + + # 如果我们找到了一个False值,并且当前在连续True区域中 + elif not input_array[i] and start_index is not None: + # 如果连续True区域长度大于1,那么将其添加到结果列表中 + if i - start_index > 1: + regions.append((start_index, i-1)) + start_index = None # 重置起始索引 + + # 如果最后一个元素是True,那么需要将最后一个连续True区域加入到结果列表中 + if start_index is not None and len(input_array) - start_index > 1: + regions.append((start_index, len(input_array)-1)) + + return regions + + +def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode): + """ + 找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。 + 一个line居中的条件是: + 1. 水平中心点跨越layout的中心点。 + 2. 
左右两侧都有空白 + """ + + for layout_i, layout_para in enumerate(page_paras): + layout_box = new_layout_bbox[layout_i] + single_line_paras_tag = [] + for i in range(len(layout_para)): + single_line_paras_tag.append(len(layout_para[i])==1 and layout_para[i][0]['spans'][0]['type']==TEXT) + + """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。""" + consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag) + if len(consecutive_single_line_indices)>0: + index_offset = 0 + """检查这些行是否是高度相同的,居中的""" + for start, end in consecutive_single_line_indices: + start += index_offset + end += index_offset + line_hi = np.array([line[0]['bbox'][3]-line[0]['bbox'][1] for line in layout_para[start:end+1]]) + first_line_text = ''.join([__get_span_text(span) for span in layout_para[start][0]['spans']]) + if "Table" in first_line_text or "Figure" in first_line_text: + pass + if debug_mode: + logger.debug(line_hi.std()) + + if line_hi.std()<2: + """行高度相同,那么判断是否居中""" + all_left_x0 = [line[0]['bbox'][0] for line in layout_para[start:end+1]] + all_right_x1 = [line[0]['bbox'][2] for line in layout_para[start:end+1]] + layout_center = (layout_box[0] + layout_box[2]) / 2 + if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \ + and not all([x0==layout_box[0] for x0 in all_left_x0]) \ + and not all([x1==layout_box[2] for x1 in all_right_x1]): + merge_para = [l[0] for l in layout_para[start:end+1]] + para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) + if debug_mode: + logger.debug(para_text) + layout_para[start:end+1] = [merge_para] + index_offset -= end-start + + return + + +def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang): + """ + 找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。 + """ + + pass + + +def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): + """ + 根据line和layout情况进行分段 + 先实现一个根据行末尾特征分段的简单方法。 + """ + """ + 算法思路: + 1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。 + 2. 
从上述行中找到末尾是句号等可作为断行标志的行。 + 3. 参照上述行尾特征进行分段。 + 4. 图、表,目前独占一行,不考虑分段。 + """ + if page_num==343: + pass + lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 + layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段 + layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落 + connected_layout_paras = __connect_para_inter_layoutbox(layout_paras2, new_layout_bbox, lang) # layout间链接段落 + + + return connected_layout_paras, page_list_info + + +def para_split(pdf_info_dict, debug_mode, lang="en"): + """ + 根据line和layout情况进行分段 + """ + new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS + all_page_list_info = [] # 保存每个页面开头和结尾是否是列表 + for page_num, page in pdf_info_dict.items(): + blocks = page['preproc_blocks'] + layout_bboxes = page['layout_bboxes'] + new_layout_bbox = __common_pre_proc(blocks, layout_bboxes) + new_layout_of_pages.append(new_layout_bbox) + splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang) + all_page_list_info.append(page_list_info) + page['para_blocks'] = splited_blocks + + """连接页面与页面之间的可能合并的段落""" + pdf_infos = list(pdf_info_dict.values()) + for page_num, page in enumerate(pdf_info_dict.values()): + if page_num==0: + continue + pre_page_paras = pdf_infos[page_num-1]['para_blocks'] + next_page_paras = pdf_infos[page_num]['para_blocks'] + pre_page_layout_bbox = new_layout_of_pages[page_num-1] + next_page_layout_bbox = new_layout_of_pages[page_num] + + is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang) + if debug_mode: + if is_conn: + logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落") + + is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, 
lang) + if debug_mode: + if is_list_conn: + logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落") + + """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接 + 1. 正文中有时出现一个行顶格,接下来几行缩进的情况。 + 2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。 + """ + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + new_layout_bbox = new_layout_of_pages[page_num] + __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode) + __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang) diff --git a/magic_pdf/para/para_split_v2.py b/magic_pdf/para/para_split_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9dcee3a20d69fd3d30b1136d0528e2673a92c0 --- /dev/null +++ b/magic_pdf/para/para_split_v2.py @@ -0,0 +1,787 @@ +from sklearn.cluster import DBSCAN +import numpy as np +from loguru import logger +import re +from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout +from magic_pdf.libs.ocr_content_type import ContentType, BlockType +from magic_pdf.model.magic_model import MagicModel +from magic_pdf.libs.Constants import * + +LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', ":", ":", ")", ")", ";"] +INLINE_EQUATION = ContentType.InlineEquation +INTERLINE_EQUATION = ContentType.InterlineEquation +TEXT = ContentType.Text +debug_able = False + + +def __get_span_text(span): + c = span.get('content', '') + if len(c) == 0: + c = span.get('image_path', '') + + return c + + +def __detect_list_lines(lines, new_layout_bboxes, lang): + global debug_able + """ + 探测是否包含了列表,并且把列表的行分开. 
+ 这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。 + """ + + def find_repeating_patterns2(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst): # Loop through the entire list + if lst[i] == 1: # If we encounter a '1', we might be at the start of a pattern + start = i + ones_in_this_interval = [i] + i += 1 + # Traverse elements that are 1, 2 or 3, until we encounter something else + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + if len(ones_in_this_interval) > 1 or ( + start < len(lst) - 1 and ones_in_this_interval and lst[start + 1] in [2, 3]): + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + return indices, ones_indices + + def find_repeating_patterns(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst) - 1: # 确保余下元素至少有2个 + if lst[i] == 1 and lst[i + 1] in [2, 3]: # 额外检查以防止连续出现的1 + start = i + ones_in_this_interval = [i] + i += 1 + while i < len(lst) and lst[i] in [2, 3]: + i += 1 + # 验证下一个序列是否符合条件 + if i < len(lst) - 1 and lst[i] == 1 and lst[i + 1] in [2, 3] and lst[i - 1] in [2, 3]: + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + else: + i += 1 + return indices, ones_indices + + """====================""" + + def split_indices(slen, index_array): + result = [] + last_end = 0 + + for start, end in sorted(index_array): + if start > last_end: + # 前一个区间结束到下一个区间开始之间的部分标记为"text" + result.append(('text', last_end, start - 1)) + # 区间内标记为"list" + result.append(('list', start, end)) + last_end = end + 1 + + if last_end < slen: + # 如果最后一个区间结束后还有剩余的字符串,将其标记为"text" + result.append(('text', last_end, slen - 1)) + + return result + + """====================""" + + if lang != 'en': + return lines, None + else: + total_lines = len(lines) + line_fea_encode = [] + """ + 对每一行进行特征编码,编码规则如下: + 1. 
如果行顶格,且大写字母开头或者数字开头,编码为1 + 2. 如果顶格,其他非大写开头编码为4 + 3. 如果非顶格,首字符大写,编码为2 + 4. 如果非顶格,首字符非大写编码为3 + """ + if len(lines) > 0: + x_map_tag_dict, min_x_tag = cluster_line_x(lines) + for l in lines: + span_text = __get_span_text(l['spans'][0]) + first_char = span_text[0] + layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes) + if not layout: + line_fea_encode.append(0) + else: + # + if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag: + # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum(): + if not first_char.isalnum() or if_match_reference_list(span_text): + line_fea_encode.append(1) + else: + line_fea_encode.append(4) + else: + if first_char.isupper(): + line_fea_encode.append(2) + else: + line_fea_encode.append(3) + + # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 + + list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode) + if len(list_indice) > 0: + if debug_able: + logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}") + + # TODO check一下这个特列表里缩进的行左侧是不是对齐的。 + segments = [] + for start, end in list_indice: + for i in range(start, end + 1): + if i > 0: + if line_fea_encode[i] == 4: + if debug_able: + logger.info(f"列表行的第{i}行不是顶格的") + break + else: + if debug_able: + logger.info(f"列表行的第{start}到第{end}行是列表") + + return split_indices(total_lines, list_indice), list_start_idx + + +def cluster_line_x(lines: list) -> dict: + """ + 对一个block内所有lines的bbox的x0聚类 + """ + min_distance = 5 + min_sample = 1 + x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + #x1_lst = np.array([[line['bbox'][2], 0] for line in lines]) + x0_2_new_val = {} # 存储旧值对应的新值映射 + min_x0 = round(lines[0]["bbox"][0]) + for label in x0_uniq_label: + if label == -1: + continue + x0_index_of_label = np.where(x0_clusters.labels_ == label) + x0_raw_val = x0_lst[x0_index_of_label][:, 0] + x0_new_val = 
np.min(x0_lst[x0_index_of_label][:, 0]) + x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val}) + if x0_new_val < min_x0: + min_x0 = x0_new_val + return x0_2_new_val, min_x0 + + +def if_match_reference_list(text: str) -> bool: + pattern = re.compile(r'^\d+\..*') + if pattern.match(text): + return True + else: + return False + + +def __valign_lines(blocks, layout_bboxes): + """ + 在一个layoutbox内对齐行的左侧和右侧。 + 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。 + 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。 + """ + + min_distance = 3 + min_sample = 2 + new_layout_bboxes = [] + + for layout_box in layout_bboxes: + blocks_in_layoutbox = [b for b in blocks if + b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])] + if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0: + new_layout_bboxes.append(layout_box['layout_bbox']) + continue + + x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + x1_uniq_label = np.unique(x1_clusters.labels_) + + x0_2_new_val = {} # 存储旧值对应的新值映射 + x1_2_new_val = {} + for label in x0_uniq_label: + if label == -1: + continue + x0_index_of_label = np.where(x0_clusters.labels_ == label) + x0_raw_val = x0_lst[x0_index_of_label][:, 0] + x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0]) + x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val}) + for label in x1_uniq_label: + if label == -1: + continue + x1_index_of_label = np.where(x1_clusters.labels_ == label) + x1_raw_val = x1_lst[x1_index_of_label][:, 0] + x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0]) + x1_2_new_val.update({idx: x1_new_val for idx in 
x1_raw_val}) + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 in x0_2_new_val: + line['bbox'][0] = int(x0_2_new_val[x0]) + + if x1 in x1_2_new_val: + line['bbox'][2] = int(x1_2_new_val[x1]) + # 其余对不齐的保持不动 + + # 由于修改了block里的line长度,现在需要重新计算block的bbox + for block in blocks_in_layoutbox: + if len(block["lines"]) > 0: + block['bbox'] = [min([line['bbox'][0] for line in block['lines']]), + min([line['bbox'][1] for line in block['lines']]), + max([line['bbox'][2] for line in block['lines']]), + max([line['bbox'][3] for line in block['lines']])] + + """新计算layout的bbox,因为block的bbox变了。""" + layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox]) + layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox]) + layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox]) + layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox]) + new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1]) + + return new_layout_bboxes + + +def __align_text_in_layout(blocks, layout_bboxes): + """ + 由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。 + """ + for layout in layout_bboxes: + lb = layout['layout_bbox'] + blocks_in_layoutbox = [block for block in blocks if + block["type"] == BlockType.Text and is_in_layout(block['bbox'], lb)] + if len(blocks_in_layoutbox) == 0: + continue + + for block in blocks_in_layoutbox: + for line in block.get("lines", []): + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 < lb[0]: + line['bbox'][0] = lb[0] + if x1 > lb[2]: + line['bbox'][2] = lb[2] + + +def __common_pre_proc(blocks, layout_bboxes): + """ + 不分语言的,对文本进行预处理 + """ + # __add_line_period(blocks, layout_bboxes) + __align_text_in_layout(blocks, layout_bboxes) + aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) + + return aligned_layout_bboxes + + +def __pre_proc_zh_blocks(blocks, layout_bboxes): + """ + 对中文文本进行分段预处理 + """ + pass + + +def 
__pre_proc_en_blocks(blocks, layout_bboxes): + """ + 对英文文本进行分段预处理 + """ + pass + + +def __group_line_by_layout(blocks, layout_bboxes): + """ + 每个layout内的行进行聚合 + """ + # 因为只是一个block一行目前, 一个block就是一个段落 + blocks_group = [] + for lyout in layout_bboxes: + blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])] + blocks_group.append(blocks_in_layout) + return blocks_group + + +def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"): + """ + lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 + 1. 先计算每个group的左右边界。 + 2. 然后根据行末尾特征进行分段。 + 末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。 + 且下一行开头不留空白。 + + """ + list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for blocks in blocks_group: + is_start_list = None + is_end_list = None + if len(blocks) == 0: + list_info.append([False, False]) + continue + if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text: + list_info.append([False, False]) + continue + if blocks[0]["type"] != BlockType.Text: + is_start_list = False + if blocks[-1]["type"] != BlockType.Text: + is_end_list = False + + lines = [line for block in blocks if + block["type"] == BlockType.Text for line in + block['lines']] + total_lines = len(lines) + if total_lines == 1 or total_lines == 0: + list_info.append([False, False]) + continue + """在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测, + 对齐方式分为以下: + 1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写) + 1) 右侧对齐的行,单独成一段 + 2) 中间对齐的行,按照字体/行高聚合成一段 + 2. 
左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表) + 这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。 + """ + text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang) + """根据list_range,把lines分成几个部分 + + """ + for list_start in list_start_line: + if len(list_start) > 1: + for i in range(0, len(list_start)): + index = list_start[i] - 1 + if index >= 0: + if "content" in lines[index]["spans"][-1]: + lines[index]["spans"][-1]["content"] += '\n\n' + layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for content_type, start, end in text_segments: + if content_type == 'list': + if start == 0 and is_start_list is None: + layout_list_info[0] = True + if end == total_lines - 1 and is_end_list is None: + layout_list_info[1] = True + + list_info.append(layout_list_info) + return list_info + + +def __split_para_lines(lines: list, text_blocks: list) -> list: + text_paras = [] + other_paras = [] + text_lines = [] + for line in lines: + + spans_types = [span["type"] for span in line] + if ContentType.Table in spans_types: + other_paras.append([line]) + continue + if ContentType.Image in spans_types: + other_paras.append([line]) + continue + if ContentType.InterlineEquation in spans_types: + other_paras.append([line]) + continue + text_lines.append(line) + + for block in text_blocks: + block_bbox = block["bbox"] + para = [] + for line in text_lines: + bbox = line["bbox"] + if is_in_layout(bbox, block_bbox): + para.append(line) + if len(para) > 0: + text_paras.append(para) + paras = other_paras.extend(text_paras) + paras_sorted = sorted(paras, key=lambda x: x[0]["bbox"][1]) + return paras_sorted + + +def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang): + global debug_able + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(blocks_group) == 0 or 
len(blocks_group) == 0: # 0的时候最后的return 会出错 + return blocks_group, [False, False] + + for i in range(1, len(blocks_group)): + if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0: + continue + pre_layout_list_info = layout_list_info[i - 1] + next_layout_list_info = layout_list_info[i] + pre_last_para = blocks_group[i - 1][-1].get("lines", []) + next_paras = blocks_group[i] + next_first_para = next_paras[0] + + if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[ + "type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + if debug_able: + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + lines = next_first_para.get("lines", []) + + for line in lines: + if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]: + may_list_lines.append(line) + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: + pre_last_para.extend(may_list_lines) + next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):] + + return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息 + + +def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, + pre_page_list_info, next_page_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错 + return False + if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0: + return False + if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text: + return False + if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + if debug_able: + logger.info(f"连接page 
{page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + next_page_first_para = next_page_paras[0][0] + if next_page_first_para["type"] == BlockType.Text: + lines = next_page_first_para["lines"] + for line in lines: + if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]: + may_list_lines.append(line) + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: + #pre_page_paras[-1].append(may_list_lines) + # 下一页合并到上一页最后一段,打一个cross_page的标签 + for line in may_list_lines: + for span in line["spans"]: + span[CROSS_PAGE] = True + pre_page_paras[-1][-1]["lines"].extend(may_list_lines) + next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):] + return True + + return False + + +def __find_layout_bbox_by_line(line_bbox, layout_bboxes): + """ + 根据line找到所在的layout + """ + for layout in layout_bboxes: + if is_in_layout(line_bbox, layout): + return layout + return None + + +def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox): + """ + layout之间进行分段。 + 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 + 连接的条件需要同时满足: + 1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。 + 2. 
下一行开头不留空白。 + + """ + connected_layout_blocks = [] + if len(blocks_group) == 0: + return connected_layout_blocks + + connected_layout_blocks.append(blocks_group[0]) + for i in range(1, len(blocks_group)): + try: + if len(blocks_group[i]) == 0: + continue + if len(blocks_group[i - 1]) == 0: # TODO 考虑连接问题, + connected_layout_blocks.append(blocks_group[i]) + continue + # text类型的段才需要考虑layout间的合并 + if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text: + connected_layout_blocks.append(blocks_group[i]) + continue + if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0: + connected_layout_blocks.append(blocks_group[i]) + continue + pre_last_line = blocks_group[i - 1][-1]["lines"][-1] + next_first_line = blocks_group[i][0]["lines"][0] + except Exception as e: + logger.error(f"page layout {i} has no line") + continue + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: + #connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + continue + pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox) + next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox) + + pre_x2_max = pre_layout[2] if pre_layout else -1 + next_x0_min = next_layout[0] if next_layout else -1 + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[ + -1] not in LINE_STOP_FLAG and \ + next_first_line['bbox'][0] == next_x0_min: # 
前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + #connected_layout_paras[-1][-1].extend(layout_paras[i][0]) + connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"]) + #layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。 + blocks_group[i][0]["lines"] = [] #删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了 + blocks_group[i][0][LINES_DELETED] = True + # if len(layout_paras[i]) == 0: + # layout_paras.pop(i) + # else: + # connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + else: + """连接段落条件不成立,将前一个layout的段落加入到结果中。""" + #connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + return connected_layout_blocks + + +def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, + lang): + """ + 连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。 + 是否可以连接的条件: + 1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。 + 2. 后一个页面的第一个段落第一行没有空白开头。 + """ + # 有的页面可能压根没有文字 + if len(pre_page_paras) == 0 or len(next_page_paras) == 0 or len(pre_page_paras[0]) == 0 or len( + next_page_paras[0]) == 0: # TODO [[]]为什么出现在pre_page_paras里? 
+ return False + pre_last_block = pre_page_paras[-1][-1] + next_first_block = next_page_paras[0][0] + if pre_last_block["type"] != BlockType.Text or next_first_block["type"] != BlockType.Text: + return False + if len(pre_last_block["lines"]) == 0 or len(next_first_block["lines"]) == 0: + return False + pre_last_para = pre_last_block["lines"] + next_first_para = next_first_block["lines"] + pre_last_line = pre_last_para[-1] + next_first_line = next_first_para[0] + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, + INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况 + # 不是文本,不连接 + return False + + pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox) + if not pre_x2_max_bbox: + return False + next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox) + if not next_x0_min_bbox: + return False + + pre_x2_max = pre_x2_max_bbox[2] + next_x0_min = next_x0_min_bbox[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \ + next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + # 下一页合并到上一页最后一段,打一个cross_page的标签 + for line in next_first_para: + for span in line["spans"]: + span[CROSS_PAGE] = True + pre_last_para.extend(next_first_para) + + #next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。 + next_page_paras[0][0]["lines"] = [] + next_page_paras[0][0][LINES_DELETED] = True + return True + else: + return False + + +def 
find_consecutive_true_regions(input_array): + start_index = None # 连续True区域的起始索引 + regions = [] # 用于保存所有连续True区域的起始和结束索引 + + for i in range(len(input_array)): + # 如果我们找到了一个True值,并且当前并没有在连续True区域中 + if input_array[i] and start_index is None: + start_index = i # 记录连续True区域的起始索引 + + # 如果我们找到了一个False值,并且当前在连续True区域中 + elif not input_array[i] and start_index is not None: + # 如果连续True区域长度大于1,那么将其添加到结果列表中 + if i - start_index > 1: + regions.append((start_index, i - 1)) + start_index = None # 重置起始索引 + + # 如果最后一个元素是True,那么需要将最后一个连续True区域加入到结果列表中 + if start_index is not None and len(input_array) - start_index > 1: + regions.append((start_index, len(input_array) - 1)) + + return regions + + +def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang): + global debug_able + """ + 找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。 + 一个line居中的条件是: + 1. 水平中心点跨越layout的中心点。 + 2. 左右两侧都有空白 + """ + + for layout_i, layout_para in enumerate(page_paras): + layout_box = new_layout_bbox[layout_i] + single_line_paras_tag = [] + for i in range(len(layout_para)): + #single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT) + single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1) + """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。""" + consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag) + if len(consecutive_single_line_indices) > 0: + #index_offset = 0 + """检查这些行是否是高度相同的,居中的""" + for start, end in consecutive_single_line_indices: + #start += index_offset + #end += index_offset + line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in + layout_para[start:end + 1]]) + first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']]) + if "Table" in first_line_text or "Figure" in first_line_text: + pass + if debug_able: + logger.info(line_hi.std()) + + if line_hi.std() < 2: + """行高度相同,那么判断是否居中""" + 
all_left_x0 = [block["lines"][0]['bbox'][0] for block in layout_para[start:end + 1]] + all_right_x1 = [block["lines"][0]['bbox'][2] for block in layout_para[start:end + 1]] + layout_center = (layout_box[0] + layout_box[2]) / 2 + if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \ + and not all([x0 == layout_box[0] for x0 in all_left_x0]) \ + and not all([x1 == layout_box[2] for x1 in all_right_x1]): + merge_para = [block["lines"][0] for block in layout_para[start:end + 1]] + para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) + if debug_able: + logger.info(para_text) + layout_para[start]["lines"] = merge_para + for i_para in range(start + 1, end + 1): + layout_para[i_para]["lines"] = [] + layout_para[i_para][LINES_DELETED] = True + #layout_para[start:end + 1] = [merge_para] + + #index_offset -= end - start + + return + + +def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang): + """ + 找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。 + """ + + pass + + +def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): + """ + 根据line和layout情况进行分段 + 先实现一个根据行末尾特征分段的简单方法。 + """ + """ + 算法思路: + 1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。 + 2. 从上述行中找到末尾是句号等可作为断行标志的行。 + 3. 参照上述行尾特征进行分段。 + 4. 
图、表,目前独占一行,不考虑分段。 + """ + blocks_group = __group_line_by_layout(blocks, layout_bboxes) # block内分段 + layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段 + blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, + page_num, lang) # layout之间连接列表段落 + connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox) # layout间链接段落 + + return connected_layout_blocks, page_list_info + + +def para_split(pdf_info_dict, debug_mode, lang="en"): + global debug_able + debug_able = debug_mode + new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS + all_page_list_info = [] # 保存每个页面开头和结尾是否是列表 + for page_num, page in pdf_info_dict.items(): + blocks = page['preproc_blocks'] + layout_bboxes = page['layout_bboxes'] + new_layout_bbox = __common_pre_proc(blocks, layout_bboxes) + new_layout_of_pages.append(new_layout_bbox) + splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang) + all_page_list_info.append(page_list_info) + page['para_blocks'] = splited_blocks + + """连接页面与页面之间的可能合并的段落""" + pdf_infos = list(pdf_info_dict.values()) + for page_num, page in enumerate(pdf_info_dict.values()): + if page_num == 0: + continue + pre_page_paras = pdf_infos[page_num - 1]['para_blocks'] + next_page_paras = pdf_infos[page_num]['para_blocks'] + pre_page_layout_bbox = new_layout_of_pages[page_num - 1] + next_page_layout_bbox = new_layout_of_pages[page_num] + + is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, + next_page_layout_bbox, page_num, lang) + if debug_able: + if is_conn: + logger.info(f"连接了第{page_num - 1}页和第{page_num}页的段落") + + is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, + next_page_layout_bbox, all_page_list_info[page_num - 1], + all_page_list_info[page_num], page_num, lang) + if debug_able: + if is_list_conn: + logger.info(f"连接了第{page_num - 
1}页和第{page_num}页的列表段落") + + """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接 + 1. 正文中有时出现一个行顶格,接下来几行缩进的情况。 + 2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。 + """ + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + new_layout_bbox = new_layout_of_pages[page_num] + __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang) + __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang) + + # layout展平 + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + page_blocks = [block for layout in page_paras for block in layout] + page["para_blocks"] = page_blocks diff --git a/magic_pdf/para/raw_processor.py b/magic_pdf/para/raw_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..edbf9964b88159d898555711571703e74fc56180 --- /dev/null +++ b/magic_pdf/para/raw_processor.py @@ -0,0 +1,207 @@ +class RawBlockProcessor: + def __init__(self) -> None: + self.y_tolerance = 2 + self.pdf_dic = {} + + def __span_flags_decomposer(self, span_flags): + """ + Make font flags human readable. + + Parameters + ---------- + self : object + The instance of the class. + + span_flags : int + span flags + + Returns + ------- + l : dict + decomposed flags + """ + + l = { + "is_superscript": False, + "is_italic": False, + "is_serifed": False, + "is_sans_serifed": False, + "is_monospaced": False, + "is_proportional": False, + "is_bold": False, + } + + if span_flags & 2**0: + l["is_superscript"] = True # 表示上标 + + if span_flags & 2**1: + l["is_italic"] = True # 表示斜体 + + if span_flags & 2**2: + l["is_serifed"] = True # 表示衬线字体 + else: + l["is_sans_serifed"] = True # 表示非衬线字体 + + if span_flags & 2**3: + l["is_monospaced"] = True # 表示等宽字体 + else: + l["is_proportional"] = True # 表示比例字体 + + if span_flags & 2**4: + l["is_bold"] = True # 表示粗体 + + return l + + def __make_new_lines(self, raw_lines): + """ + This function makes new lines. + + Parameters + ---------- + self : object + The instance of the class. 
+ + raw_lines : list + raw lines + + Returns + ------- + new_lines : list + new lines + """ + new_lines = [] + new_line = None + + for raw_line in raw_lines: + raw_line_bbox = raw_line["bbox"] + raw_line_spans = raw_line["spans"] + raw_line_text = "".join([span["text"] for span in raw_line_spans]) + raw_line_dir = raw_line.get("dir", None) + + decomposed_line_spans = [] + for span in raw_line_spans: + raw_flags = span["flags"] + decomposed_flags = self.__span_flags_decomposer(raw_flags) + span["decomposed_flags"] = decomposed_flags + decomposed_line_spans.append(span) + + if new_line is None: + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": decomposed_line_spans, + } + else: + if ( + abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance + and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance + ): + new_line["bbox"] = ( + min(new_line["bbox"][0], raw_line_bbox[0]), # left + new_line["bbox"][1], # top + max(new_line["bbox"][2], raw_line_bbox[2]), # right + raw_line_bbox[3], # bottom + ) + new_line["text"] += " " + raw_line_text + new_line["spans"].extend(raw_line_spans) + new_line["dir"] = ( + new_line["dir"][0] + raw_line_dir[0], + new_line["dir"][1] + raw_line_dir[1], + ) + else: + new_lines.append(new_line) + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": raw_line_spans, + } + if new_line: + new_lines.append(new_line) + + return new_lines + + def __make_new_block(self, raw_block): + """ + This function makes a new block. + + Parameters + ---------- + self : object + The instance of the class. 
+ ---------- + raw_block : dict + a raw block + + Returns + ------- + new_block : dict + + Schema of new_block: + { + "block_id": "block_1", + "bbox": [0, 0, 100, 100], + "text": "This is a block.", + "lines": [ + { + "bbox": [0, 0, 100, 100], + "text": "This is a line.", + "spans": [ + { + "text": "This is a span.", + "font": "Times New Roman", + "size": 12, + "color": "#000000", + } + ], + } + ], + } + """ + new_block = {} + + block_id = raw_block["number"] + block_bbox = raw_block["bbox"] + block_text = " ".join(span["text"] for line in raw_block["lines"] for span in line["spans"]) + raw_lines = raw_block["lines"] + block_lines = self.__make_new_lines(raw_lines) + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["lines"] = block_lines + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json. 
+ + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "preproc_blocks" in blocks.keys(): + input_blocks = blocks["preproc_blocks"] + for raw_block in input_blocks: + new_block = self.__make_new_block(raw_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + diff --git a/magic_pdf/para/stats.py b/magic_pdf/para/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..fd509b952b8df09d7dcaba4a561a0c9f8caced78 --- /dev/null +++ b/magic_pdf/para/stats.py @@ -0,0 +1,268 @@ +from collections import Counter +import numpy as np + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class BlockStatisticsCalculator: + def __init__(self) -> None: + pass + + def __calc_stats_of_new_lines(self, new_lines): + """ + This function calculates the paragraph metrics + + Parameters + ---------- + combined_lines : list + combined lines + + Returns + ------- + X0 : float + Median of x0 values, which represents the left average boundary of the block + X1 : float + Median of x1 values, which represents the right average boundary of the block + avg_char_width : float + Average of char widths, which represents the average char width of the block + avg_char_height : float + Average of line heights, which represents the average line height of the block + + """ + x0_values = [] + x1_values = [] + char_widths = [] + char_heights = [] + + block_font_types = [] + block_font_sizes = [] + block_directions = [] + + if len(new_lines) > 0: + for i, line in enumerate(new_lines): + line_bbox = line["bbox"] + line_text = line["text"] + line_spans = line["spans"] + + num_chars = len([ch for ch in line_text if not ch.isspace()]) + + x0_values.append(line_bbox[0]) + x1_values.append(line_bbox[2]) + + if num_chars > 0: + char_width = (line_bbox[2] 
- line_bbox[0]) / num_chars + char_widths.append(char_width) + + for span in line_spans: + block_font_types.append(span["font"]) + block_font_sizes.append(span["size"]) + + if "dir" in line: + block_directions.append(line["dir"]) + + # line_font_types = [span["font"] for span in line_spans] + char_heights = [span["size"] for span in line_spans] + + X0 = np.median(x0_values) if x0_values else 0 + X1 = np.median(x1_values) if x1_values else 0 + avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0 + avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0 + + # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None + + max_span_length = 0 + max_span_font_type = None + for line in new_lines: + line_spans = line["spans"] + for span in line_spans: + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_span_length: + max_span_length = span_length + max_span_font_type = span["font"] + + max_freq_font_type = max_span_font_type + + avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None + + avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0 + avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0 + + median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None + + return ( + X0, + X1, + avg_char_width, + avg_char_height, + max_freq_font_type, + avg_font_size, + (avg_dir_horizontal, avg_dir_vertical), + median_font_size, + ) + + def __make_new_block(self, input_block): + new_block = {} + + raw_lines = input_block["lines"] + stats = self.__calc_stats_of_new_lines(raw_lines) + + block_id = input_block["block_id"] + block_bbox = input_block["bbox"] + block_text = input_block["text"] + block_lines = raw_lines + block_avg_left_boundary = stats[0] + block_avg_right_boundary = stats[1] + 
block_avg_char_width = stats[2] + block_avg_char_height = stats[3] + block_font_type = stats[4] + block_font_size = stats[5] + block_direction = stats[6] + block_median_font_size = stats[7] + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["dir"] = block_direction + new_block["X0"] = block_avg_left_boundary + new_block["X1"] = block_avg_right_boundary + new_block["avg_char_width"] = block_avg_char_width + new_block["avg_char_height"] = block_avg_char_height + new_block["block_font_type"] = block_font_type + new_block["block_font_size"] = block_font_size + new_block["lines"] = block_lines + new_block["median_font_size"] = block_median_font_size + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json + + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + input_blocks = blocks["para_blocks"] + for input_block in input_blocks: + new_block = self.__make_new_block(input_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + + +class DocStatisticsCalculator: + def __init__(self) -> None: + pass + + def calc_stats_of_doc(self, pdf_dict): + """ + This function computes the statistics of the document + + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + statistics : dict + statistics of the document + """ + + total_text_length = 0 + total_num_blocks = 0 + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "para_blocks" in 
blocks.keys(): + para_blocks = blocks["para_blocks"] + for para_block in para_blocks: + total_text_length += len(para_block["text"]) + total_num_blocks += 1 + + avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0 + + font_list = [] + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "para_blocks" in blocks.keys(): + input_blocks = blocks["para_blocks"] + for input_block in input_blocks: + block_text_length = len(input_block.get("text", "")) + if block_text_length < avg_text_length * 0.5: + continue + block_font_type = safe_get(input_block, "block_font_type", "") + block_font_size = safe_get(input_block, "block_font_size", 0) + font_list.append((block_font_type, block_font_size)) + + font_counter = Counter(font_list) + most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0) + second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0) + + statistics = { + "num_pages": 0, + "num_blocks": 0, + "num_paras": 0, + "num_titles": 0, + "num_header_blocks": 0, + "num_footer_blocks": 0, + "num_watermark_blocks": 0, + "num_vertical_margin_note_blocks": 0, + "most_common_font_type": most_common_font[0][0], + "most_common_font_size": most_common_font[0][1], + "number_of_most_common_font": most_common_font[1], + "second_most_common_font_type": second_most_common_font[0][0], + "second_most_common_font_size": second_most_common_font[0][1], + "number_of_second_most_common_font": second_most_common_font[1], + "avg_text_length": avg_text_length, + } + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + blocks = pdf_dict[page_id]["para_blocks"] + statistics["num_pages"] += 1 + for block_id, block_data in enumerate(blocks): + statistics["num_blocks"] += 1 + + if "paras" in block_data.keys(): + statistics["num_paras"] += len(block_data["paras"]) + + for line in block_data["lines"]: + if line.get("is_title", 0): + 
statistics["num_titles"] += 1 + + if block_data.get("is_header", 0): + statistics["num_header_blocks"] += 1 + if block_data.get("is_footer", 0): + statistics["num_footer_blocks"] += 1 + if block_data.get("is_watermark", 0): + statistics["num_watermark_blocks"] += 1 + if block_data.get("is_vertical_margin_note", 0): + statistics["num_vertical_margin_note_blocks"] += 1 + + pdf_dict["statistics"] = statistics + + return pdf_dict + + diff --git a/magic_pdf/para/title_processor.py b/magic_pdf/para/title_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..00d330fcd062b847cd5f8de0d16ea7bbb683837a --- /dev/null +++ b/magic_pdf/para/title_processor.py @@ -0,0 +1,1014 @@ +import os +import re +import numpy as np + +from magic_pdf.libs.nlp_utils import NLPModels + +from magic_pdf.para.commons import * + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class TitleProcessor: + def __init__(self, *doc_statistics) -> None: + if len(doc_statistics) > 0: + self.doc_statistics = doc_statistics[0] + + self.nlp_model = NLPModels() + self.MAX_TITLE_LEVEL = 3 + self.numbered_title_pattern = r""" + ^ # 行首 + ( # 开始捕获组 + [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1) + |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2) + |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A) + |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A) + |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I) + |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I) + |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1 + |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、 + |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一) + |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1 + |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a) + |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a) + |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A- + |\w+:\s # 英文序号词后跟冒号和空格,例如:First: + |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格 + |[IVXLCDM]+\. # 罗马数字后跟点,例如:I. + |\d+\.\s # 单个数字后跟点和空格,例如:1. 
+ ) # 结束捕获组 + .+ # 标题的其余部分 + """ + + def _is_potential_title( + self, + curr_line, + prev_line, + prev_line_is_title, + next_line, + avg_char_width, + avg_char_height, + median_font_size, + ): + """ + This function checks if the line is a potential title. + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the line is a potential title, False otherwise. + """ + + def __is_line_centered(line_bbox, page_bbox, avg_char_width): + """ + This function checks if the line is centered on the page + + Parameters + ---------- + line_bbox : list + bbox of the line + page_bbox : list + bbox of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is centered on the page, False otherwise. + """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, x1, _ = line_bbox + _, _, page_x1, _ = page_bbox + + return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres + + def __is_bold_font_line(line): + """ + Check if a line contains any bold font style. + """ + + def _is_bold_span(span): + # if span text is empty or only contains space, return False + if not span["text"].strip(): + return False + + return bool(span["flags"] & 2**4) # Check if the font is bold + + for span in line["spans"]: + if not _is_bold_span(span): + return False + + return True + + def __is_italic_font_line(line): + """ + Check if a line contains any italic font style. 
+ """ + + def __is_italic_span(span): + return bool(span["flags"] & 2**1) # Check if the font is italic + + for span in line["spans"]: + if not __is_italic_span(span): + return False + + return True + + def __is_punctuation_heavy(line_text): + """ + Check if the line contains a high ratio of punctuation marks, which may indicate + that the line is not a title. + + Parameters: + line_text (str): Text of the line. + + Returns: + bool: True if the line is heavy with punctuation, False otherwise. + """ + # Pattern for common title format like "X.Y. Title" + pattern = r"\b\d+\.\d+\..*\b" + + # If the line matches the title format, return False + if re.match(pattern, line_text.strip()): + return False + + # Find all punctuation marks in the line + punctuation_marks = re.findall(r"[^\w\s]", line_text) + number_of_punctuation_marks = len(punctuation_marks) + + text_length = len(line_text) + + if text_length == 0: + return False + + punctuation_ratio = number_of_punctuation_marks / text_length + if punctuation_ratio >= 0.1: + return True + + return False + + def __has_mixed_font_styles(spans, strict_mode=False): + """ + This function checks if the line has mixed font styles, the strict mode will compare the font types + + Parameters + ---------- + spans : list + spans of the line + strict_mode : bool + True for strict mode, the font types will be fully compared + False for non-strict mode, the font types will be compared by the most longest common prefix + + Returns + ------- + bool + True if the line has mixed font styles, False otherwise. 
+ """ + if strict_mode: + font_styles = set() + for span in spans: + font_style = span["font"].lower() + font_styles.add(font_style) + + return len(font_styles) > 1 + + else: # non-strict mode + font_styles = [] + for span in spans: + font_style = span["font"].lower() + font_styles.append(font_style) + + if len(font_styles) > 1: + longest_common_prefix = os.path.commonprefix(font_styles) + if len(longest_common_prefix) > 0: + return False + else: + return True + else: + return False + + def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type): + """ + This function checks if the current line has a different font type from the previous and next lines + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + prev_line_font_type : str + font type of the previous line + next_line_font_type : str + font type of the next line + + Returns + ------- + bool + True if the current line has a different font type from the previous and next lines, False otherwise. + """ + return all( + curr_line_font_type != other_font_type.lower() + for other_font_type in [prev_line_font_type, next_line_font_type] + if other_font_type is not None + ) + + def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size): + """ + This function checks if the current line has a larger font size than the previous and next lines + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + prev_line_font_size : float + font size of the previous line + next_line_font_size : float + font size of the next line + + Returns + ------- + bool + True if the current line has a larger font size than the previous and next lines, False otherwise. 
+ """ + return all( + curr_line_font_size > other_font_size * 1.2 + for other_font_size in [prev_line_font_size, next_line_font_size] + if other_font_size is not None + ) + + def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size): + """ + This function checks if the current line is similar to the previous line + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + + Returns + ------- + bool + True if the current line is similar to the previous line, False otherwise. + """ + + if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size: + return True + else: + return False + + def __is_same_font_type_of_docAvg(curr_line_font_type): + """ + This function checks if the current line has the same font type as the document average font type + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + + Returns + ------- + bool + True if the current line has the same font type as the document average font type, False otherwise. + """ + doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower() + doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower() + + return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type] + + def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1): + """ + This function checks if the current line has a large enough font size + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + ratio : float + ratio of the current line font size to the document average font size + + Returns + ------- + bool + True if the current line has a large enough font size, False otherwise. 
+ """ + doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0) + doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0) + doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size) + + return curr_line_font_size >= doc_avg_font_size * ratio + + def __is_sufficient_spacing_above_and_below( + curr_line_bbox, + prev_line_bbox, + next_line_bbox, + avg_char_height, + median_font_size, + ): + """ + This function checks if the current line has sufficient spacing above and below + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the current line has sufficient spacing above and below, False otherwise. + """ + vertical_ratio = 1.25 + vertical_thres = vertical_ratio * median_font_size + + _, y0, _, y1 = curr_line_bbox + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3]) + sufficient_spacing_above = vertical_spacing_above > vertical_thres + else: + sufficient_spacing_above = True + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1) + sufficient_spacing_below = vertical_spacing_below > vertical_thres + else: + sufficient_spacing_below = True + + return (sufficient_spacing_above, sufficient_spacing_below) + + def __is_word_list_line_by_rules(curr_line_text): + """ + This function checks if the current line is a word list + + Parameters + ---------- + curr_line_text : str + text of the current line + + Returns + ------- + bool + True if the current line is a name list, False otherwise. 
+ """ + # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)" + name_list_pattern = r"(?= 0.9: + return True + + return False + + def __is_equation(line_text): + """ + This function checks if the current line is an equation. + + Parameters + ---------- + line_text : str + + Returns + ------- + bool + True if the current line is an equation, False otherwise. + """ + equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations + + if re.search(equation_reg, line_text): + return True + else: + return False + + def __is_title_by_len(text, max_length=200): + """ + This function checks if the current line is a title by length. + + Parameters + ---------- + text : str + text of the current line + + max_length : int + max length of the title + + Returns + ------- + bool + True if the current line is a title, False otherwise. + + """ + text = text.strip() + return len(text) <= max_length + + def __compute_line_font_type_and_size(curr_line): + """ + This function computes the font type and font size of the line. + + Parameters + ---------- + line : dict + line + + Returns + ------- + font_type : str + font type of the line + font_size : float + font size of the line + """ + spans = curr_line["spans"] + max_accumulated_length = 0 + max_span_font_size = curr_line["spans"][0]["size"] # default value, float type + max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type + for span in spans: + if span["text"].isspace(): + continue + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_accumulated_length: + max_accumulated_length = span_length + max_span_font_size = span["size"] + max_span_font_type = span["font"].lower() + + return max_span_font_type, max_span_font_size + + """ + Title detecting main Process. + """ + + """ + Basic features about the current line. 
+ """ + curr_line_bbox = curr_line["bbox"] + curr_line_text = curr_line["text"] + curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line) + + if len(curr_line_text.strip()) == 0: # skip empty lines + return False + + prev_line_bbox = prev_line["bbox"] if prev_line else None + if prev_line: + prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line) + else: + prev_line_font_type, prev_line_font_size = None, None + + next_line_bbox = next_line["bbox"] if next_line else None + if next_line: + next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line) + else: + next_line_font_type, next_line_font_size = None, None + + """ + Aggregated features about the current line. + """ + is_italc_font = __is_italic_font_line(curr_line) + is_bold_font = __is_bold_font_line(curr_line) + + is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8) + is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1) + is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6) + + is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type) + + is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg + + is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True) + is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False) + + is_punctuation_heavy = __is_punctuation_heavy(curr_line_text) + + is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text) + is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"] + + is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors( + curr_line_font_size, prev_line_font_size, next_line_font_size + ) + + 
is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors( + curr_line_font_type, prev_line_font_type, next_line_font_type + ) + + has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size + ) + + is_similar_to_pre_line = __is_similar_to_pre_line( + curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size + ) + + """ + Further aggregated features about the current line. + + Attention: + Features that start with __ are for internal use. + """ + + __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width + ) + __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors + is_a_left_inline_title = ( + is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors + ) + + is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font + is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font + + is_title_by_check_pre_and_next_line = ( + (prev_line is not None or next_line is not None) + and has_sufficient_spaces_above + and has_sufficient_spaces_below + and is_potential_title_font + ) + + is_numbered_title = __is_numbered_title(curr_line_text) and ( + (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None) + ) + + is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text) + + is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text) + + is_equation = __is_equation(curr_line_text) + + is_title_by_len = __is_title_by_len(curr_line_text) + + """ + Decide if the line is a title. 
+ """ + # is_title = False + # if prev_line_is_title: + + is_title = ( + is_not_end_with_ending_puncs # not end with ending punctuation marks + and is_not_only_no_meaning_symbols # not only have no meaning symbols + and is_title_by_len # is a title by length, default max length is 200 + and not is_equation # an interline equation should never be a title + and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type + and ( + (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + or ( + is_much_larger_font_than_doc_avg + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_font_size_little_less_than_doc_avg + and is_bold_font + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + ) # not the same font type as the document average font type, which includes the most common font type and the second most common font type + and ( + ( + not is_person_or_org_list_line_by_nlp + and ( + is_much_larger_font_than_doc_avg + or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + ) + ) + or ( + not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp) + and not is_a_left_inline_title + and not is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_person_or_org_list_line_by_nlp + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + ) + or (is_numbered_title and not is_a_left_inline_title) + ) + ) + # ) or (is_similar_to_pre_line and prev_line_is_title) + + 
is_name_or_org_list_to_be_removed = ( + (is_person_or_org_list_line_by_nlp) + and is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) and not is_title + + if is_name_or_org_list_to_be_removed: + is_author_or_org_list = True + # print curr_line_text to check + # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}") + else: + is_author_or_org_list = False + """ + # print reason why the line is a title + if is_title: + print_green("This line is a title.") + print_green("↓" * 10) + print() + print("curr_line_text: ", curr_line_text) + print() + + # print reason why the line is not a title + line_text = curr_line_text.strip() + test_text = "Career/Personal Life" + text_content_condition = line_text == test_text + + if not is_title and text_content_condition: # Print specific line + # if not is_title: # Print each line + print_red("This line is not a title.") + print_red("↓" * 10) + + print() + print("curr_line_text: ", curr_line_text) + print() + + if is_not_end_with_ending_puncs: + print_green(f"is_not_end_with_ending_puncs") + else: + print_red(f"is_end_with_ending_puncs") + + if is_not_only_no_meaning_symbols: + print_green(f"is_not_only_no_meaning_symbols") + else: + print_red(f"is_only_no_meaning_symbols") + + if is_title_by_len: + print_green(f"is_title_by_len: {is_title_by_len}") + else: + print_red(f"is_not_title_by_len: {is_title_by_len}") + + if is_equation: + print_red(f"is_equation") + else: + print_green(f"is_not_equation") + + if is_potential_title_font: + print_green(f"is_potential_title_font") + else: + print_red(f"is_not_potential_title_font") + + if is_punctuation_heavy: + print_red("is_punctuation_heavy") + else: + print_green("is_not_punctuation_heavy") + + if is_bold_font: + print_green(f"is_bold_font") + else: + print_red(f"is_not_bold_font") + + if is_font_size_not_less_than_doc_avg: + print_green(f"is_larger_font_than_doc_avg") + else: + 
print_red(f"is_not_larger_font_than_doc_avg") + + if is_much_larger_font_than_doc_avg: + print_green(f"is_much_larger_font_than_doc_avg") + else: + print_red(f"is_not_much_larger_font_than_doc_avg") + + if is_not_same_font_type_of_docAvg: + print_green(f"is_not_same_font_type_of_docAvg") + else: + print_red(f"is_same_font_type_of_docAvg") + + if is_word_list_line_by_rules: + print_red("is_word_list_line_by_rules") + else: + print_green("is_not_name_list_by_rules") + + if is_person_or_org_list_line_by_nlp: + print_red("is_person_or_org_list_line_by_nlp") + else: + print_green("is_not_person_or_org_list_line_by_nlp") + + if not is_numbered_title: + print_red("is_not_numbered_title") + else: + print_green("is_numbered_title") + + if is_a_left_inline_title: + print_red("is_a_left_inline_title") + else: + print_green("is_not_a_left_inline_title") + + if not is_title_by_check_prev_line: + print_red("is_not_title_by_check_prev_line") + else: + print_green("is_title_by_check_prev_line") + + if not is_title_by_check_next_line: + print_red("is_not_title_by_check_next_line") + else: + print_green("is_title_by_check_next_line") + + if not is_title_by_check_pre_and_next_line: + print_red("is_not_title_by_check_pre_and_next_line") + else: + print_green("is_title_by_check_pre_and_next_line") + + # print_green("Common features:") + # print_green("↓" * 10) + + # print(f" curr_line_font_type: {curr_line_font_type}") + # print(f" curr_line_font_size: {curr_line_font_size}") + # print() + + """ + + return is_title, is_author_or_org_list + + def _detect_block_title(self, input_block): + """ + Use the functions 'is_potential_title' to detect titles of each paragraph block. + If a line is a title, then the value of key 'is_title' of the line will be set to True. 
+ """ + + raw_lines = input_block["lines"] + + prev_line_is_title_flag = False + + for i, curr_line in enumerate(raw_lines): + prev_line = raw_lines[i - 1] if i > 0 else None + next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None + + blk_avg_char_width = input_block["avg_char_width"] + blk_avg_char_height = input_block["avg_char_height"] + blk_media_font_size = input_block["median_font_size"] + + is_title, is_author_or_org_list = self._is_potential_title( + curr_line, + prev_line, + prev_line_is_title_flag, + next_line, + blk_avg_char_width, + blk_avg_char_height, + blk_media_font_size, + ) + + if is_title: + curr_line["is_title"] = is_title + prev_line_is_title_flag = True + else: + curr_line["is_title"] = False + prev_line_is_title_flag = False + + if is_author_or_org_list: + curr_line["is_author_or_org_list"] = is_author_or_org_list + else: + curr_line["is_author_or_org_list"] = False + + return input_block + + def batch_process_blocks_detect_titles(self, pdf_dic): + """ + This function batch process the blocks to detect titles. + + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + num_titles = 0 + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + para_blocks = blocks["para_blocks"] + + all_single_line_blocks = [] + for block in para_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + new_para_blocks = [] + if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks. + for para_block in para_blocks: + new_block = self._detect_block_title(para_block) + new_para_blocks.append(new_block) + num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]]) + else: # All blocks are single line blocks. 
+ for para_block in para_blocks: + new_para_blocks.append(para_block) + num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]]) + para_blocks = new_para_blocks + + blocks["para_blocks"] = para_blocks + + for para_block in para_blocks: + all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"]) + para_text_len = sum([len(line["text"]) for line in para_block["lines"]]) + if ( + all_titles and para_text_len < 200 + ): # total length of the paragraph is less than 200, more than this should not be a title + para_block["is_block_title"] = 1 + else: + para_block["is_block_title"] = 0 + + all_name_or_org_list_to_be_removed = all( + safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"] + ) + if all_name_or_org_list_to_be_removed and page_id == "page_0": + para_block["is_block_an_author_or_org_list"] = 1 + else: + para_block["is_block_an_author_or_org_list"] = 0 + + pdf_dic["statistics"]["num_titles"] = num_titles + + return pdf_dic + + def __determine_size_based_level(self, title_blocks): + """ + This function determines the title level based on the font size of the title. 
+ + Parameters + ---------- + title_blocks : list + + Returns + ------- + title_blocks : list + """ + + font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks]) + + # Use the mean and std of font sizes to remove extreme values + mean_font_size = np.mean(font_sizes) + std_font_size = np.std(font_sizes) + min_extreme_font_size = mean_font_size - std_font_size # type: ignore + max_extreme_font_size = mean_font_size + std_font_size # type: ignore + + # Compute the threshold for title level + middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)] + if middle_font_sizes.size > 0: + middle_mean_font_size = np.mean(middle_font_sizes) + level_threshold = middle_mean_font_size + else: + level_threshold = mean_font_size + + for tb in title_blocks: + title_block = tb["block"] + title_font_size = safe_get(title_block, "block_font_size", 0) + + current_level = 1 # Initialize title level, the biggest level is 1 + + # print(f"Before adjustment by font size, {current_level}") + if title_font_size >= max_extreme_font_size: + current_level = 1 + elif title_font_size <= min_extreme_font_size: + current_level = 3 + elif float(title_font_size) >= float(level_threshold): + current_level = 2 + else: + current_level = 3 + # print(f"After adjustment by font size, {current_level}") + + title_block["block_title_level"] = current_level + + return title_blocks + + def batch_process_blocks_recog_title_level(self, pdf_dic): + title_blocks = [] + + # Collect all titles + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = blocks.get("para_blocks", []) + for block in para_blocks: + if block.get("is_block_title"): + title_obj = {"page_id": page_id, "block": block} + title_blocks.append(title_obj) + + # Determine title level + if title_blocks: + # Determine title level based on font size + title_blocks = self.__determine_size_based_level(title_blocks) + + return pdf_dic diff 
def parse_pdf_by_ocr(pdf_bytes,
                     model_list,
                     imageWriter,
                     start_page_id=0,
                     end_page_id=None,
                     debug_mode=False,
                     ):
    """Parse a PDF with the OCR pipeline.

    Thin wrapper that delegates to ``pdf_parse_union`` with mode "ocr";
    all other arguments are forwarded unchanged.
    """
    return pdf_parse_union(
        pdf_bytes,
        model_list,
        imageWriter,
        "ocr",
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )


def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF with the text-extraction pipeline.

    Thin wrapper that delegates to ``pdf_parse_union`` with mode "txt";
    all other arguments are forwarded unchanged.
    """
    return pdf_parse_union(
        pdf_bytes,
        model_list,
        imageWriter,
        "txt",
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )
magic_pdf.libs.markdown_utils import escape_special_markdown_char +from magic_pdf.libs.safe_filename import sanitize_filename +from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page +from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes +from magic_pdf.pre_proc.detect_images import parse_images +from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox +from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox +from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox +from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox +from magic_pdf.pre_proc.detect_footnote import ( + parse_footnotes_by_model, + parse_footnotes_by_rule, +) # 获取footnotes的bbox +from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox + +from magic_pdf.post_proc.detect_para import ( + ParaProcessPipeline, + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +from magic_pdf.pre_proc.main_text_font import get_main_text_font +from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock +from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page +from magic_pdf.train_utils.extract_caption import extract_caption_bbox + +""" +from para.para_pipeline import ParaProcessPipeline +from para.exceptions import ( + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +""" + +from magic_pdf.libs.commons import read_file, join_path +from magic_pdf.post_proc.remove_footnote import ( + merge_footnote_blocks, + remove_footnote_blocks, +) +from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker +from magic_pdf.pre_proc.equations_replace import ( + combine_chars_to_pymudict, + remove_chars_in_text_blocks, + replace_equations_in_textblock, +) +from 
magic_pdf.pre_proc.pdf_pre_filter import pdf_filter +from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header +from magic_pdf.pre_proc.construct_page_dict import construct_page_component +from magic_pdf.pre_proc.fix_image import ( + combine_images, + fix_image_vertical, + fix_seperated_image, + include_img_title, +) +from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter +from magic_pdf.pre_proc.remove_rotate_bbox import ( + get_side_boundry, + remove_rotate_side_textblock, + remove_side_blank_block, +) +from magic_pdf.pre_proc.resolve_bbox_conflict import ( + check_text_block_horizontal_overlap, + resolve_bbox_overlap_conflict, +) +from magic_pdf.pre_proc.fix_table import ( + fix_table_text_block, + fix_tables, + include_table_title, +) +from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval + +denseSingleLineBlockException_msg = DenseSingleLineBlockException().message +titleDetectionException_msg = TitleDetectionException().message +titleLevelException_msg = TitleLevelException().message +paraSplitException_msg = ParaSplitException().message +paraMergeException_msg = ParaMergeException().message + + +def parse_pdf_for_train( + s3_pdf_path, + s3_pdf_profile, + pdf_model_output, + save_path, + book_name, + image_s3_config=None, + start_page_id=0, + end_page_id=None, + junk_img_bojids=[], + debug_mode=False, +): + pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) + save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") + md_bookname_save_path = "" + book_name = sanitize_filename(book_name) + if debug_mode: + save_path = join_path(save_tmp_path, "md") + pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name) + + if not os.path.exists(os.path.dirname(pdf_local_path)): + # 如果目录不存在,创建它 + os.makedirs(os.path.dirname(pdf_local_path)) + + md_bookname_save_path = join_path(save_tmp_path, "md", book_name) + if not os.path.exists(md_bookname_save_path): + # 如果目录不存在,创建它 + 
os.makedirs(md_bookname_save_path) + + with open(pdf_local_path + ".pdf", "wb") as pdf_file: + pdf_file.write(pdf_bytes) + + pdf_docs = fitz.open("pdf", pdf_bytes) + pdf_info_dict = {} + img_s3_client = get_img_s3_client( + save_path, image_s3_config + ) # 更改函数名和参数,避免歧义 + # img_s3_client = "img_s3_client" #不创建这个对象,直接用字符串占位 + + start_time = time.time() + + """通过统计pdf全篇文字,识别正文字体""" + main_text_font = get_main_text_font(pdf_docs) + + end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 + for page_id in range(start_page_id, end_page_id + 1): + page = pdf_docs[page_id] + page_width = page.rect.width + page_height = page.rect.height + + if debug_mode: + time_now = time.time() + logger.info( + f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" + ) + start_time = time_now + """ + # 通过一个规则,过滤掉单页超过1500非junkimg的pdf + # 对单页面非重复id的img数量做统计,如果当前页超过1500则直接return need_drop + """ + page_imgs = page.get_images() + img_counts = 0 + for img in page_imgs: + img_bojid = img[0] + if img_bojid in junk_img_bojids: # 判断这个图片在不在junklist中 + continue # 如果在junklist就不用管了,跳过 + else: + recs = page.get_image_rects(img, transform=True) + if recs: # 如果这张图在当前页面有展示 + img_counts += 1 + if ( + img_counts >= 1500 + ): # 如果去除了junkimg的影响,单页img仍然超过1500的话,就排除当前pdf + logger.warning( + f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: {DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 首先获取基本的block数据,对pdf进行分解,获取图片、表格、公式、text的bbox + """ + # 解析pdf原始文本block + text_raw_blocks = page.get_text( + "dict", + flags=fitz.TEXTFLAGS_TEXT, + )["blocks"] + model_output_json = get_docx_model_output( + pdf_model_output, page_id + ) + + # 解析图片 + image_bboxes = parse_images(page_id, 
page, model_output_json, junk_img_bojids) + image_bboxes = fix_image_vertical( + image_bboxes, text_raw_blocks + ) # 修正图片的位置 + image_bboxes = fix_seperated_image(image_bboxes) # 合并有边重合的图片 + + old_image_bboxes = deepcopy(image_bboxes) + image_bboxes = include_img_title( + text_raw_blocks, image_bboxes + ) # 向图片上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + """此时image_bboxes中可能出现这种情况,水平并列的2个图片,下方分别有各自的子标题,2个子标题下方又有大标题(形如Figxxx),会出现2个图片的bbox都包含了这个大标题,这种情况需要把图片合并""" + image_bboxes = combine_images(image_bboxes) # 合并图片 + + # 解析表格并对table_bboxes进行位置的微调,防止表格周围的文字被截断 + table_bboxes = parse_tables(page_id, page, model_output_json) + table_bboxes = fix_tables( + page, table_bboxes, include_table_title=False, scan_line_num=2 + ) # 修正 + table_bboxes = fix_table_text_block( + text_raw_blocks, table_bboxes + ) # 修正与text block的关系,某些table修正与pymupdf获取到的table内textblock没有完全包含,因此要进行一次修正。 + # debug_show_bbox(pdf_docs, page_id, table_bboxes, [], [b['bbox'] for b in text_raw_blocks], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + old_table_bboxes = deepcopy(table_bboxes) + table_bboxes = include_table_title( + text_raw_blocks, table_bboxes + ) # 向table上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + + # 解析公式 + equations_inline_bboxes, equations_interline_bboxes = parse_equations( + page_id, page, model_output_json + ) + + # get image box and caption ! + image_bboxes_with_caption = extract_caption_bbox(image_bboxes, old_image_bboxes) + + # get table box and caption ! 
+ table_bboxes_with_caption = extract_caption_bbox(table_bboxes, old_table_bboxes) + + """ + ================================================================================================================================== + 进入预处理-1阶段 + ------------------- + # # 解析标题 + # title_bboxs = parse_titles(page_id, page, model_output_json) + # # 评估Layout是否规整、简单 + # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json) + 接下来开始进行预处理过程 + """ + # title_bboxs = parse_titles(page_id, page, model_output_json) + + """去掉每页的页码、页眉、页脚""" + page_no_bboxs = parse_pageNos(page_id, page, model_output_json) + header_bboxs = parse_headers(page_id, page, model_output_json) + footer_bboxs = parse_footers(page_id, page, model_output_json) + ( + image_bboxes, + table_bboxes, + remain_text_blocks, + removed_hdr_foot_txt_block, + removed_hdr_foot_img_block, + removed_hdr_foot_table, + ) = remove_headder_footer_one_page( + text_raw_blocks, + image_bboxes, + table_bboxes, + header_bboxs, + footer_bboxs, + page_no_bboxs, + page_width, + page_height, + ) + + """去除页面上半部分长条色块内的文本块""" + remain_text_blocks, removed_colored_narrow_strip_background_text_block = ( + remove_colored_strip_textblock(remain_text_blocks, page) + ) + + # debug_show_bbox(pdf_docs, page_id, footnote_bboxes_by_model, [b['bbox'] for b in remain_text_blocks], header_bboxs, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + """去掉旋转的文字:水印、垂直排列的文字""" + remain_text_blocks, removed_non_horz_text_block = remove_rotate_side_textblock( + remain_text_blocks, page_width, page_height + ) # 去掉水印,非水平文字 + remain_text_blocks, removed_empty_side_block = remove_side_blank_block( + remain_text_blocks, page_width, page_height + ) # 删除页面四周可能会留下的完全空白的textblock,这种block形成原因未知 + + """出现在图片、表格上的文字块去掉,把层叠的图片单独分离出来,不参与layout的计算""" + ( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + text_block_on_image_removed, + 
images_overlap_backup, + interline_eq_temp_text_block, + ) = resolve_bbox_overlap_conflict( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + ) + + # """去掉footnote, 从文字和图片中""" + # # 通过模型识别到的footnote + # footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path, + # debug_mode=debug_mode) + # # 通过规则识别到的footnote + # footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id) + """ + ================================================================================================================================== + """ + if debug_mode: # debugmode截图到本地 + save_path = join_path(save_tmp_path, "md") + + # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容 + image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = ( + txt_save_images_by_bboxes( + book_name, + page_id, + page, + save_path, + image_bboxes, + images_overlap_backup, + table_bboxes, + equations_inline_bboxes, + equations_interline_bboxes, + # 传入img_s3_client + img_s3_client, + ) + ) # 只要表格和图片的截图 + + """"以下进入到公式替换环节 """ + char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ + "blocks" + ] + remain_text_blocks = combine_chars_to_pymudict( + remain_text_blocks, char_level_text_blocks + ) # 合并chars + remain_text_blocks = replace_equations_in_textblock( + remain_text_blocks, inline_eq_info, interline_eq_info + ) + remain_text_blocks = remove_citation_marker( + remain_text_blocks + ) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。 + remain_text_blocks = remove_chars_in_text_blocks( + remain_text_blocks + ) # 减少中间态数据体积 + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3) + + """去掉footnote, 从文字和图片中(先去角标再去footnote试试)""" + # 通过模型识别到的footnote + footnote_bboxes_by_model = parse_footnotes_by_model( + page_id, + page, + 
model_output_json, + md_bookname_save_path, + debug_mode=debug_mode, + ) + # 通过规则识别到的footnote + footnote_bboxes_by_rule = parse_footnotes_by_rule( + remain_text_blocks, page_height, page_id, main_text_font + ) + """进入pdf过滤器,去掉一些不合理的pdf""" + is_good_pdf, err = pdf_filter( + page, remain_text_blocks, table_bboxes, image_bboxes + ) + if not is_good_pdf: + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {err}" + ) + if not debug_mode: + return err + + """ + ================================================================================================================================== + 进行版面布局切分和过滤 + """ + """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ + + is_text_block_horz_overlap = check_text_block_horizontal_overlap( + remain_text_blocks, header_bboxs, footer_bboxs + ) + + if is_text_block_horz_overlap: + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0) + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP, + } + if not debug_mode: + return result + + """统一格式化成一个数据结构用于计算layout""" + page_y0 = 0 if len(header_bboxs) == 0 else max([b[3] for b in header_bboxs]) + page_y1 = ( + page_height if len(footer_bboxs) == 0 else min([b[1] for b in footer_bboxs]) + ) + left_x, right_x = get_side_boundry( + removed_non_horz_text_block, page_width, page_height + ) + page_boundry = [ + math.floor(left_x), + page_y0 + 1, + math.ceil(right_x), + page_y1 - 1, + ] + # 返回的是一个数组,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 
对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是段落的内容 + + all_bboxes = prepare_bboxes_for_layout_split( + image_info, + image_backup_info, + table_info, + inline_eq_info, + interline_eq_info, + remain_text_blocks, + page_boundry, + page, + ) + # debug_show_bbox(pdf_docs, page_id, [], [], all_bboxes, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 1) + """page_y0, page_y1能够过滤掉页眉和页脚,不会算作layout内""" + layout_bboxes, layout_tree = get_bboxes_layout( + all_bboxes, page_boundry, page_id + ) + + if ( + len(remain_text_blocks) > 0 + and len(all_bboxes) > 0 + and len(layout_bboxes) == 0 + ): + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT, + } + if not debug_mode: + return result + + """以下去掉复杂的布局和超过2列的布局""" + if any( + [lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes] + ): # 复杂的布局 + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.COMPLICATED_LAYOUT} + if not debug_mode: + return result + + layout_column_width = get_columns_cnt_of_layout(layout_tree) + if layout_column_width > 2: # 去掉超过2列的布局pdf + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS, + "extra_info": {"column_cnt": layout_column_width}, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 构造出下游需要的数据结构 + """ + remain_text_blocks = ( + remain_text_blocks + interline_eq_temp_text_block + ) # 把计算layout时候临时删除的行间公式再放回去,防止行间公式替换的时候丢失。 + removed_text_blocks = [] + removed_text_blocks.extend(removed_hdr_foot_txt_block) + # 
removed_text_blocks.extend(removed_footnote_text_block) + removed_text_blocks.extend(text_block_on_image_removed) + removed_text_blocks.extend(removed_non_horz_text_block) + removed_text_blocks.extend(removed_colored_narrow_strip_background_text_block) + + removed_images = [] + # removed_images.extend(footnote_imgs) + removed_images.extend(removed_hdr_foot_img_block) + + images_backup = [] + images_backup.extend(image_backup_info) + remain_text_blocks = escape_special_markdown_char( + remain_text_blocks + ) # 转义span里的text + sorted_text_remain_text_block = sort_text_block( + remain_text_blocks, layout_bboxes + ) + + footnote_bboxes_tmp = [] + footnote_bboxes_tmp.extend(footnote_bboxes_by_model) + footnote_bboxes_tmp.extend(footnote_bboxes_by_rule) + + page_info = construct_page_component( + page_id, + image_info, + table_info, + sorted_text_remain_text_block, + layout_bboxes, + inline_eq_info, + interline_eq_info, + page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"], + removed_text_blocks=removed_text_blocks, + removed_image_blocks=removed_images, + images_backup=images_backup, + droped_table_block=[], + table_backup=[], + layout_tree=layout_tree, + page_w=page.rect.width, + page_h=page.rect.height, + footnote_bboxes_tmp=footnote_bboxes_tmp, + ) + + page_info["image_bboxes_with_caption"] = image_bboxes_with_caption # add by xr + page_info["table_bboxes_with_caption"] = table_bboxes_with_caption + + page_info["bak_page_no_bboxes"] = page_no_bboxs + page_info["bak_header_bboxes"] = header_bboxs + page_info["bak_footer_bboxes"] = footer_bboxs + page_info["bak_footer_note_bboxes"] = footnote_bboxes_tmp + + pdf_info_dict[f"page_{page_id}"] = page_info + + # end page for + + """计算后处理阶段耗时""" + start_time = time.time() + + """ + ================================================================================================================================== + 去掉页眉和页脚,这里需要用到一定的统计量,所以放到最后 + 页眉和页脚主要从文本box和图片box中去除,位于页面的四周。 + 
下面函数会直接修改pdf_info_dict,从文字块中、图片中删除属于页眉页脚的内容,删除内容做相对应记录 + """ + # 去页眉页脚 + header, footer = drop_footer_header( + pdf_info_dict + ) # TODO: using header and footer boxes here ! + + """对单个layout内footnote和他下面的所有textbbox合并""" + + for page_key, page_info in pdf_info_dict.items(): + page_info = merge_footnote_blocks(page_info, main_text_font) + page_info = remove_footnote_blocks(page_info) + pdf_info_dict[page_key] = page_info + + """进入pdf后置过滤器,去掉一些不合理的pdf""" + + i = 0 + for page_info in pdf_info_dict.values(): + is_good_pdf, err = pdf_post_filter(page_info) + if not is_good_pdf: + logger.warning(f"page_id: {i}, drop this pdf: {book_name}, reason: {err}") + if not debug_mode: + return err + i += 1 + + if debug_mode: + params_file_save_path = join_path( + save_tmp_path, "md", book_name, "preproc_out.json" + ) + page_draw_rect_save_path = join_path( + save_tmp_path, "md", book_name, "layout.pdf" + ) + # dir_path = os.path.dirname(page_draw_rect_save_path) + # if not os.path.exists(dir_path): + # # 如果目录不存在,创建它 + # os.makedirs(dir_path) + + with open(params_file_save_path, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除 + if os.path.exists(page_draw_rect_save_path): + os.remove(page_draw_rect_save_path) + # 绘制bbox和layout到pdf + draw_bbox_on_page(pdf_docs, pdf_info_dict, page_draw_rect_save_path) + draw_layout_bbox_on_page( + pdf_docs, pdf_info_dict, header, footer, page_draw_rect_save_path + ) + + if debug_mode: + # 打印后处理阶段耗时 + logger.info(f"post_processing_time: {get_delta_time(start_time)}") + + """ + ================================================================================================================================== + 进入段落处理-2阶段 + """ + + # 处理行内文字间距较大问题 + pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict) + + start_time = time.time() + + para_process_pipeline = ParaProcessPipeline() + + def _deal_with_text_exception(error_info): + logger.warning( + f"page_id: 
{page_id}, drop this pdf: {book_name}, reason: {error_info}" + ) + if error_info == denseSingleLineBlockException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK, + } + return result + if error_info == titleDetectionException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TITLE_DETECTION_FAILED, + } + return result + elif error_info == titleLevelException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.TITLE_LEVEL_FAILED} + return result + elif error_info == paraSplitException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.PARA_SPLIT_FAILED} + return result + elif error_info == paraMergeException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.PARA_MERGE_FAILED} + return result + + if debug_mode: + input_pdf_file = f"{pdf_local_path}.pdf" + output_dir = f"{save_path}/{book_name}" + output_pdf_file = f"{output_dir}/pdf_annos.pdf" + + """ + Call the para_process_pipeline function to process the pdf_info_dict. + + Parameters: + para_debug_mode: str or None + If para_debug_mode is None, the para_process_pipeline will not keep any intermediate results. + If para_debug_mode is "simple", the para_process_pipeline will only keep the annos on the pdf and the final results as a json file. + If para_debug_mode is "full", the para_process_pipeline will keep all the intermediate results generated during each step. 
+ """ + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict, + para_debug_mode="simple", + input_pdf_path=input_pdf_file, + output_pdf_path=output_pdf_file, + ) + # 打印段落处理阶段耗时 + logger.info(f"para_process_time: {get_delta_time(start_time)}") + + # debug的时候不return drop信息 + if error_info is not None: + _deal_with_text_exception(error_info) + return pdf_info_dict + else: + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict + ) + if error_info is not None: + return _deal_with_text_exception(error_info) + + return pdf_info_dict diff --git a/magic_pdf/pdf_parse_union_core.py b/magic_pdf/pdf_parse_union_core.py new file mode 100644 index 0000000000000000000000000000000000000000..60de2aa5360f0a72ebb58c92bc13f4913cf52f21 --- /dev/null +++ b/magic_pdf/pdf_parse_union_core.py @@ -0,0 +1,241 @@ +import time + +from loguru import logger + +from magic_pdf.libs.commons import fitz, get_delta_time +from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout +from magic_pdf.libs.convert_utils import dict_to_list +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.hash_utils import compute_md5 +from magic_pdf.libs.math import float_equal +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.model.magic_model import MagicModel +from magic_pdf.para.para_split_v2 import para_split +from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker +from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2 +from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table +from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \ + combine_chars_to_pymudict +from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split +from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, 
fix_block_spans, \ + fix_discarded_block +from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \ + remove_overlaps_low_confidence_spans +from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap + + +def remove_horizontal_overlap_block_which_smaller(all_bboxes): + useful_blocks = [] + for bbox in all_bboxes: + useful_blocks.append({ + "bbox": bbox[:4] + }) + is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks) + if is_useful_block_horz_overlap: + logger.warning( + f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}") + for bbox in all_bboxes.copy(): + if smaller_bbox == bbox[:4]: + all_bboxes.remove(bbox) + + return is_useful_block_horz_overlap, all_bboxes + + +def txt_spans_extract(pdf_page, inline_equations, interline_equations): + text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] + char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ + "blocks" + ] + text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks) + text_blocks = replace_equations_in_textblock( + text_blocks, inline_equations, interline_equations + ) + text_blocks = remove_citation_marker(text_blocks) + text_blocks = remove_chars_in_text_blocks(text_blocks) + spans = [] + for v in text_blocks: + for line in v["lines"]: + for span in line["spans"]: + bbox = span["bbox"] + if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]): + continue + if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation): + spans.append( + { + "bbox": list(span["bbox"]), + "content": span["text"], + "type": ContentType.Text, + "score": 1.0, + } + ) + return spans + + +def replace_text_span(pymu_spans, ocr_spans): + return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans 
+ + +def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode): + need_drop = False + drop_reason = [] + + '''从magic_model对象中获取后面会用到的区块信息''' + img_blocks = magic_model.get_imgs(page_id) + table_blocks = magic_model.get_tables(page_id) + discarded_blocks = magic_model.get_discarded(page_id) + text_blocks = magic_model.get_text_blocks(page_id) + title_blocks = magic_model.get_title_blocks(page_id) + inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id) + + page_w, page_h = magic_model.get_page_size(page_id) + + spans = magic_model.get_all_spans(page_id) + + '''根据parse_mode,构造spans''' + if parse_mode == "txt": + """ocr 中文本类的 span 用 pymu spans 替换!""" + pymu_spans = txt_spans_extract( + pdf_docs[page_id], inline_equations, interline_equations + ) + spans = replace_text_span(pymu_spans, spans) + elif parse_mode == "ocr": + pass + else: + raise Exception("parse_mode must be txt or ocr") + + '''删除重叠spans中置信度较低的那些''' + spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) + '''删除重叠spans中较小的那些''' + spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) + '''对image和table截图''' + spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter) + + '''将所有区块的bbox整理到一起''' + # @todo interline_equation_blocks参数不够准,后面切换到interline_equations上 + if len(interline_equation_blocks) > 0: + all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( + img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, + interline_equation_blocks, page_w, page_h) + else: + all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( + img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, + interline_equations, page_w, page_h) + if len(drop_reasons) > 0: + need_drop = True + drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION) + + 
'''先处理不需要排版的discarded_blocks''' + discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4) + fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans) + + '''如果当前页面没有bbox则跳过''' + if len(all_bboxes) == 0: + logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}") + return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], + [], [], interline_equations, fix_discarded_blocks, + need_drop, drop_reason) + + """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ + + while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况 + is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes) + if is_useful_block_horz_overlap: + need_drop = True + drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP) + else: + break + + '''根据区块信息计算layout''' + page_boundry = [0, 0, page_w, page_h] + layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id) + + if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0: + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}") + need_drop = True + drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT) + + """以下去掉复杂的布局和超过2列的布局""" + if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局 + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}") + need_drop = True + drop_reason.append(DropReason.COMPLICATED_LAYOUT) + + layout_column_width = get_columns_cnt_of_layout(layout_tree) + if layout_column_width > 2: # 去掉超过2列的布局pdf + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}") + need_drop = True + drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS) + + '''根据layout顺序,对当前页面所有需要留下的block进行排序''' + sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) + + 
'''将span填入排好序的blocks中''' + block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6) + + '''对block进行fix操作''' + fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks) + + '''获取QA需要外置的list''' + images, tables, interline_equations = get_qa_need_list_v2(fix_blocks) + + '''构造pdf_info_dict''' + page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, + images, tables, interline_equations, fix_discarded_blocks, + need_drop, drop_reason) + return page_info + + +def pdf_parse_union(pdf_bytes, + model_list, + imageWriter, + parse_mode, + start_page_id=0, + end_page_id=None, + debug_mode=False, + ): + pdf_bytes_md5 = compute_md5(pdf_bytes) + pdf_docs = fitz.open("pdf", pdf_bytes) + + '''初始化空的pdf_info_dict''' + pdf_info_dict = {} + + '''用model_list和docs对象初始化magic_model''' + magic_model = MagicModel(model_list, pdf_docs) + + '''根据输入的起始范围解析pdf''' + end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 + + '''初始化启动时间''' + start_time = time.time() + + for page_id in range(start_page_id, end_page_id + 1): + + '''debug时输出每页解析的耗时''' + if debug_mode: + time_now = time.time() + logger.info( + f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" + ) + start_time = time_now + + '''解析pdf中的每一页''' + page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode) + pdf_info_dict[f"page_{page_id}"] = page_info + + """分段""" + para_split(pdf_info_dict, debug_mode=debug_mode) + + """dict转list""" + pdf_info_list = dict_to_list(pdf_info_dict) + new_pdf_info_dict = { + "pdf_info": pdf_info_list, + } + + return new_pdf_info_dict + + +if __name__ == '__main__': + pass diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..f802a9a55d3c9c5745f40ff8be239f1b29565644 --- /dev/null +++ b/magic_pdf/pipe/AbsPipe.py @@ -0,0 +1,107 @@ +from abc import ABC, abstractmethod + +from 
magic_pdf.dict2md.ocr_mkcontent import union_make +from magic_pdf.filter.pdf_classify_by_type import classify +from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan +from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.json_compressor import JsonCompressor + + +class AbsPipe(ABC): + """ + txt和ocr处理的抽象类 + """ + PIP_OCR = "ocr" + PIP_TXT = "txt" + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False): + self.pdf_bytes = pdf_bytes + self.model_list = model_list + self.image_writer = image_writer + self.pdf_mid_data = None # 未压缩 + self.is_debug = is_debug + + def get_compress_pdf_mid_data(self): + return JsonCompressor.compress_json(self.pdf_mid_data) + + @abstractmethod + def pipe_classify(self): + """ + 有状态的分类 + """ + raise NotImplementedError + + @abstractmethod + def pipe_analyze(self): + """ + 有状态的跑模型分析 + """ + raise NotImplementedError + + @abstractmethod + def pipe_parse(self): + """ + 有状态的解析 + """ + raise NotImplementedError + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) + return content_list + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode) + return md_content + + @staticmethod + def classify(pdf_bytes: bytes) -> str: + """ + 根据pdf的元数据,判断是文本pdf,还是ocr pdf + """ + pdf_meta = pdf_meta_scan(pdf_bytes) + if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常 + raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}") + else: + is_encrypted = pdf_meta["is_encrypted"] + is_needs_password = pdf_meta["is_needs_password"] + if 
is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}") + else: + is_text_pdf, results = classify( + pdf_meta["total_page"], + pdf_meta["page_width_pts"], + pdf_meta["page_height_pts"], + pdf_meta["image_info_per_page"], + pdf_meta["text_len_per_page"], + pdf_meta["imgs_per_page"], + pdf_meta["text_layout_per_page"], + pdf_meta["invalid_chars"], + ) + if is_text_pdf: + return AbsPipe.PIP_TXT + else: + return AbsPipe.PIP_OCR + + @staticmethod + def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list: + """ + 根据pdf类型,生成统一格式content_list + """ + pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) + pdf_info_list = pdf_mid_data["pdf_info"] + content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path) + return content_list + + @staticmethod + def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list: + """ + 根据pdf类型,markdown + """ + pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) + pdf_info_list = pdf_mid_data["pdf_info"] + md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path) + return md_content + + diff --git a/magic_pdf/pipe/OCRPipe.py b/magic_pdf/pipe/OCRPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..a46bdfd9788bb58d0594b65b47d60ef5465fb8c3 --- /dev/null +++ b/magic_pdf/pipe/OCRPipe.py @@ -0,0 +1,32 @@ +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_ocr_pdf + + +class OCRPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: 
bool = False): + super().__init__(pdf_bytes, model_list, image_writer, is_debug) + + def pipe_classify(self): + pass + + def pipe_analyze(self): + self.model_list = doc_analyze(self.pdf_bytes, ocr=True) + + def pipe_parse(self): + self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("ocr_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"ocr_pipe mk {md_make_mode} finished") + return result diff --git a/magic_pdf/pipe/TXTPipe.py b/magic_pdf/pipe/TXTPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..647b50699c217cc25a73fdac1bb038c8f661f109 --- /dev/null +++ b/magic_pdf/pipe/TXTPipe.py @@ -0,0 +1,33 @@ +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.libs.json_compressor import JsonCompressor +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_txt_pdf + + +class TXTPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False): + super().__init__(pdf_bytes, model_list, image_writer, is_debug) + + def pipe_classify(self): + pass + + def pipe_analyze(self): + self.model_list = doc_analyze(self.pdf_bytes, ocr=False) + + def pipe_parse(self): + self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = 
super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("txt_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"txt_pipe mk {md_make_mode} finished") + return result diff --git a/magic_pdf/pipe/UNIPipe.py b/magic_pdf/pipe/UNIPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..243a8964edd33796e0f0be20ee29f2e7e603fa50 --- /dev/null +++ b/magic_pdf/pipe/UNIPipe.py @@ -0,0 +1,85 @@ +import json + +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +from magic_pdf.libs.commons import join_path +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf + + +class UNIPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False): + self.pdf_type = jso_useful_key["_pdf_type"] + super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug) + if len(self.model_list) == 0: + self.input_model_is_empty = True + else: + self.input_model_is_empty = False + + def pipe_classify(self): + self.pdf_type = AbsPipe.classify(self.pdf_bytes) + + def pipe_analyze(self): + if self.pdf_type == self.PIP_TXT: + self.model_list = doc_analyze(self.pdf_bytes, ocr=False) + elif self.pdf_type == self.PIP_OCR: + self.model_list = doc_analyze(self.pdf_bytes, ocr=True) + + def pipe_parse(self): + if self.pdf_type == self.PIP_TXT: + self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, + is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty) + elif 
self.pdf_type == self.PIP_OCR: + self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, + is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("uni_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"uni_pipe mk {md_make_mode} finished") + return result + + +if __name__ == '__main__': + # 测试 + drw = DiskReaderWriter(r"D:/project/20231108code-clean") + + pdf_file_path = r"linshixuqiu\19983-00.pdf" + model_file_path = r"linshixuqiu\19983-00.json" + pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN) + model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT) + model_list = json.loads(model_json_txt) + write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00" + img_bucket_path = "imgs" + img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path)) + + # pdf_type = UNIPipe.classify(pdf_bytes) + # jso_useful_key = { + # "_pdf_type": pdf_type, + # "model_list": model_list + # } + + jso_useful_key = { + "_pdf_type": "", + "model_list": model_list + } + pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer) + pipe.pipe_classify() + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(img_bucket_path) + content_list = pipe.pipe_mk_uni_format(img_bucket_path) + + md_writer = DiskReaderWriter(write_path) + md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT) + md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json", + AbsReaderWriter.MODE_TXT) + md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT) diff --git a/magic_pdf/pipe/__init__.py b/magic_pdf/pipe/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/post_proc/__init__.py b/magic_pdf/post_proc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/post_proc/detect_para.py b/magic_pdf/post_proc/detect_para.py new file mode 100644 index 0000000000000000000000000000000000000000..17b41d27cff8c87ab9bdd4f040e9ed14904f91cd --- /dev/null +++ b/magic_pdf/post_proc/detect_para.py @@ -0,0 +1,3472 @@ +import os +import sys +import json +import re +import math +import unicodedata +from collections import Counter + + +import numpy as np +from termcolor import cprint + + +from magic_pdf.libs.commons import fitz +from magic_pdf.libs.nlp_utils import NLPModels + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +def open_pdf(pdf_path): + try: + pdf_document = fitz.open(pdf_path) # type: ignore + return pdf_document + except Exception as e: + print(f"无法打开PDF文件:{pdf_path}。原因是:{e}") + raise e + + +def print_green_on_red(text): + cprint(text, "green", "on_red", attrs=["bold"], end="\n\n") + + +def print_green(text): + print() + cprint(text, "green", attrs=["bold"], end="\n\n") + + +def print_red(text): + print() + cprint(text, "red", attrs=["bold"], end="\n\n") + + +def print_yellow(text): + print() + cprint(text, "yellow", attrs=["bold"], end="\n\n") + + +def safe_get(dict_obj, key, default): + val = dict_obj.get(key) + if val is None: + return default + else: + return val + + +def is_bbox_overlap(bbox1, bbox2): + """ + This function checks if bbox1 and bbox2 overlap or not + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 and bbox2 overlap, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 > x1_2 or x0_2 > x1_1: + return False + if y0_1 > y1_2 or y0_2 > y1_1: + return False + + 
return True + + +def is_in_bbox(bbox1, bbox2): + """ + This function checks if bbox1 is in bbox2 + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 is in bbox2, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2: + return True + else: + return False + + +def calculate_para_bbox(lines): + """ + This function calculates the minimum bbox of the paragraph + + Parameters + ---------- + lines : list + lines + + Returns + ------- + para_bbox : list + bbox of the paragraph + """ + x0 = min(line["bbox"][0] for line in lines) + y0 = min(line["bbox"][1] for line in lines) + x1 = max(line["bbox"][2] for line in lines) + y1 = max(line["bbox"][3] for line in lines) + return [x0, y0, x1, y1] + + +def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is right aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is right aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + _, _, x1, _ = curr_line_bbox + _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x1 - prev_x1) < horizontal_thres + elif direction == 1: + return abs(x1 - next_x1) < horizontal_thres + elif direction == 2: + return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres + else: + return False + + +def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is left aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is left aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, _, _ = curr_line_bbox + prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x0 - prev_x0) < horizontal_thres + elif direction == 1: + return abs(x0 - next_x0) < horizontal_thres + elif direction == 2: + return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres + else: + return False + + +def end_with_punctuation(line_text): + """ + This function checks if the line ends with punctuation marks + """ + + english_end_puncs = [".", "?", "!"] + chinese_end_puncs = ["。", "?", "!"] + end_puncs = english_end_puncs + chinese_end_puncs + + last_non_space_char = None + for ch in line_text[::-1]: + if not ch.isspace(): + last_non_space_char = ch + break + + if last_non_space_char is None: + return False + + return last_non_space_char in end_puncs + + +def is_nested_list(lst): + if isinstance(lst, list): + return any(isinstance(sub, list) for sub in lst) + return False + + +class DenseSingleLineBlockException(Exception): + """ + This class defines the exception type for dense single line-block. + """ + + def __init__(self, message="DenseSingleLineBlockException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleDetectionException(Exception): + """ + This class defines the exception type for title detection. + """ + + def __init__(self, message="TitleDetectionException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleLevelException(Exception): + """ + This class defines the exception type for title level. 
+ """ + + def __init__(self, message="TitleLevelException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaSplitException(Exception): + """ + This class defines the exception type for paragraph splitting. + """ + + def __init__(self, message="ParaSplitException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaMergeException(Exception): + """ + This class defines the exception type for paragraph merging. + """ + + def __init__(self, message="ParaMergeException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class DiscardByException: + """ + This class discards pdf files by exception + """ + + def __init__(self) -> None: + pass + + def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException): + """ + This function discards pdf files by single line block exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + exception_page_nums = 0 + page_num = 0 + for page_id, page in pdf_dic.items(): + if page_id.startswith("page_"): + page_num += 1 + if "preproc_blocks" in page.keys(): + preproc_blocks = page["preproc_blocks"] + + all_single_line_blocks = [] + for block in preproc_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9: + exception_page_nums += 1 + + if page_num == 0: + return None + + if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded + return exception.message + + return None + + def 
discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
        """
        This function discards pdf files by title detection exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: title-detection failures never discard a document.
        # return exception.message
        return None

    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
        """
        This function discards pdf files by title level exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: title-level failures never discard a document.
        # return exception.message
        return None

    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
        """
        This function discards pdf files by split para exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: paragraph-split failures never discard a document.
        # return exception.message
        return None

    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
        """
        This function discards pdf files by merge para exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: paragraph-merge failures never discard a document.
        # return exception.message
        return None


class LayoutFilterProcessor:
    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """
        This function processes the blocks in batch.

        Parameters
        ----------
        self : object
            The instance of the class.
+ + pdf_dict : dict + pdf dictionary + + Returns + ------- + pdf_dict : dict + pdf dictionary + """ + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys(): + layout_bbox_objs = blocks["layout_bboxes"] + if layout_bbox_objs is None: + continue + layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs] + + # Enlarge each value of x0, y0, x1, y1 for each layout_bbox to prevent loss of text. + layout_bboxes = [ + [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes + ] + + para_blocks = blocks["para_blocks"] + if para_blocks is None: + continue + + for lb_bbox in layout_bboxes: + for i, para_block in enumerate(para_blocks): + para_bbox = para_block["bbox"] + para_blocks[i]["in_layout"] = 0 + if is_in_bbox(para_bbox, lb_bbox): + para_blocks[i]["in_layout"] = 1 + + blocks["para_blocks"] = para_blocks + + return pdf_dict + + +class RawBlockProcessor: + def __init__(self) -> None: + self.y_tolerance = 2 + self.pdf_dic = {} + + def __span_flags_decomposer(self, span_flags): + """ + Make font flags human readable. + + Parameters + ---------- + self : object + The instance of the class. 

        span_flags : int
            span flags

        Returns
        -------
        l : dict
            decomposed flags
        """

        l = {
            "is_superscript": False,
            "is_italic": False,
            "is_serifed": False,
            "is_sans_serifed": False,
            "is_monospaced": False,
            "is_proportional": False,
            "is_bold": False,
        }

        if span_flags & 2**0:
            l["is_superscript"] = True  # superscript

        if span_flags & 2**1:
            l["is_italic"] = True  # italic

        if span_flags & 2**2:
            l["is_serifed"] = True  # serifed font
        else:
            l["is_sans_serifed"] = True  # sans-serif font

        if span_flags & 2**3:
            l["is_monospaced"] = True  # monospaced font
        else:
            l["is_proportional"] = True  # proportional font

        if span_flags & 2**4:
            l["is_bold"] = True  # bold

        return l

    def __make_new_lines(self, raw_lines):
        """
        This function makes new lines.

        Merges consecutive raw lines that sit on (almost) the same baseline
        into single logical lines, decomposing each span's font flags on the way.

        Parameters
        ----------
        self : object
            The instance of the class.

        raw_lines : list
            raw lines

        Returns
        -------
        new_lines : list
            new lines
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join([span["text"] for span in raw_line_spans])
            raw_line_dir = raw_line.get("dir", None)

            decomposed_line_spans = []
            for span in raw_line_spans:
                raw_flags = span["flags"]
                decomposed_flags = self.__span_flags_decomposer(raw_flags)
                span["decomposed_flags"] = decomposed_flags
                decomposed_line_spans.append(span)

            if new_line is None:  # Handle the first line
                new_line = {
                    "bbox": raw_line_bbox,
                    "text": raw_line_text,
                    "dir": raw_line_dir if raw_line_dir else (0, 0),
                    "spans": decomposed_line_spans,
                }
            else:  # Handle the rest lines
                # Same baseline: both top and bottom edges within y_tolerance.
                if (
                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
                ):
                    new_line["bbox"] = (
                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
                        new_line["bbox"][1],  # top
                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
                        raw_line_bbox[3],  # bottom
                    )
new_line["text"] += raw_line_text + new_line["spans"].extend(raw_line_spans) + new_line["dir"] = ( + new_line["dir"][0] + raw_line_dir[0], + new_line["dir"][1] + raw_line_dir[1], + ) + else: + new_lines.append(new_line) + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": raw_line_spans, + } + if new_line: + new_lines.append(new_line) + + return new_lines + + def __make_new_block(self, raw_block): + """ + This function makes a new block. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + raw_block : dict + a raw block + + Returns + ------- + new_block : dict + """ + new_block = {} + + block_id = raw_block["number"] + block_bbox = raw_block["bbox"] + block_text = "".join(span["text"] for line in raw_block["lines"] for span in line["spans"]) + raw_lines = raw_block["lines"] + block_lines = self.__make_new_lines(raw_lines) + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["lines"] = block_lines + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. + + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "preproc_blocks" in blocks.keys(): + input_blocks = blocks["preproc_blocks"] + for raw_block in input_blocks: + new_block = self.__make_new_block(raw_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + + +class BlockStatisticsCalculator: + """ + This class calculates the statistics of the block. 
+ """ + + def __init__(self) -> None: + pass + + def __calc_stats_of_new_lines(self, new_lines): + """ + This function calculates the paragraph metrics + + Parameters + ---------- + combined_lines : list + combined lines + + Returns + ------- + X0 : float + Median of x0 values, which represents the left average boundary of the block + X1 : float + Median of x1 values, which represents the right average boundary of the block + avg_char_width : float + Average of char widths, which represents the average char width of the block + avg_char_height : float + Average of line heights, which represents the average line height of the block + + """ + x0_values = [] + x1_values = [] + char_widths = [] + char_heights = [] + + block_font_types = [] + block_font_sizes = [] + block_directions = [] + + if len(new_lines) > 0: + for i, line in enumerate(new_lines): + line_bbox = line["bbox"] + line_text = line["text"] + line_spans = line["spans"] + + num_chars = len([ch for ch in line_text if not ch.isspace()]) + + x0_values.append(line_bbox[0]) + x1_values.append(line_bbox[2]) + + if num_chars > 0: + char_width = (line_bbox[2] - line_bbox[0]) / num_chars + char_widths.append(char_width) + + for span in line_spans: + block_font_types.append(span["font"]) + block_font_sizes.append(span["size"]) + + if "dir" in line: + block_directions.append(line["dir"]) + + # line_font_types = [span["font"] for span in line_spans] + char_heights = [span["size"] for span in line_spans] + + X0 = np.median(x0_values) if x0_values else 0 + X1 = np.median(x1_values) if x1_values else 0 + avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0 + avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0 + + # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None + + max_span_length = 0 + max_span_font_type = None + for line in new_lines: + line_spans = line["spans"] + for span in line_spans: + span_length = 
span["bbox"][2] - span["bbox"][0] + if span_length > max_span_length: + max_span_length = span_length + max_span_font_type = span["font"] + + max_freq_font_type = max_span_font_type + + avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None + + avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0 + avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0 + + median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None + + return ( + X0, + X1, + avg_char_width, + avg_char_height, + max_freq_font_type, + avg_font_size, + (avg_dir_horizontal, avg_dir_vertical), + median_font_size, + ) + + def __make_new_block(self, input_block): + new_block = {} + + raw_lines = input_block["lines"] + stats = self.__calc_stats_of_new_lines(raw_lines) + + block_id = input_block["block_id"] + block_bbox = input_block["bbox"] + block_text = input_block["text"] + block_lines = raw_lines + block_avg_left_boundary = stats[0] + block_avg_right_boundary = stats[1] + block_avg_char_width = stats[2] + block_avg_char_height = stats[3] + block_font_type = stats[4] + block_font_size = stats[5] + block_direction = stats[6] + block_median_font_size = stats[7] + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["dir"] = block_direction + new_block["X0"] = block_avg_left_boundary + new_block["X1"] = block_avg_right_boundary + new_block["avg_char_width"] = block_avg_char_width + new_block["avg_char_height"] = block_avg_char_height + new_block["block_font_type"] = block_font_type + new_block["block_font_size"] = block_font_size + new_block["lines"] = block_lines + new_block["median_font_size"] = block_median_font_size + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. 

        Parameters
        ----------
        self : object
            The instance of the class.
        ----------
        blocks : list
            Input block is a list of raw blocks.
            Schema can refer to the value of key ""preproc_blocks".

        Returns
        -------
        result_dict : dict
            result dictionary
        """

        for page_id, blocks in pdf_dic.items():
            if page_id.startswith("page_"):
                para_blocks = []
                if "para_blocks" in blocks.keys():
                    input_blocks = blocks["para_blocks"]
                    for input_block in input_blocks:
                        new_block = self.__make_new_block(input_block)
                        para_blocks.append(new_block)

                blocks["para_blocks"] = para_blocks

        return pdf_dic


class DocStatisticsCalculator:
    """
    This class calculates the statistics of the document.
    """

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        This function computes the statistics of the document

        Parameters
        ----------
        result_dict : dict
            result dictionary

        Returns
        -------
        statistics : dict
            statistics of the document
        """

        total_text_length = 0
        total_num_blocks = 0

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                if "para_blocks" in blocks.keys():
                    para_blocks = blocks["para_blocks"]
                    for para_block in para_blocks:
                        total_text_length += len(para_block["text"])
                        total_num_blocks += 1

        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0

        font_list = []

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                if "para_blocks" in blocks.keys():
                    input_blocks = blocks["para_blocks"]
                    for input_block in input_blocks:
                        block_text_length = len(input_block.get("text", ""))
                        # Skip short blocks (headers/footers/captions) so they do
                        # not skew the dominant-font statistics.
                        if block_text_length < avg_text_length * 0.5:
                            continue
                        block_font_type = safe_get(input_block, "block_font_type", "")
                        block_font_size = safe_get(input_block, "block_font_size", 0)
                        font_list.append((block_font_type, block_font_size))

        font_counter = Counter(font_list)
        most_common_font = font_counter.most_common(1)[0] 
if font_list else (("", 0), 0)
        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)

        # Each *_font entry is ((font_type, font_size), occurrence_count).
        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                blocks = pdf_dict[page_id]["para_blocks"]
                statistics["num_pages"] += 1
                for block_id, block_data in enumerate(blocks):
                    statistics["num_blocks"] += 1

                    if "paras" in block_data.keys():
                        statistics["num_paras"] += len(block_data["paras"])

                    for line in block_data["lines"]:
                        if line.get("is_title", 0):
                            statistics["num_titles"] += 1

                    if block_data.get("is_header", 0):
                        statistics["num_header_blocks"] += 1
                    if block_data.get("is_footer", 0):
                        statistics["num_footer_blocks"] += 1
                    if block_data.get("is_watermark", 0):
                        statistics["num_watermark_blocks"] += 1
                    if block_data.get("is_vertical_margin_note", 0):
                        statistics["num_vertical_margin_note_blocks"] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict


class TitleProcessor:
    """
    This class processes the title.
+ """ + + def __init__(self, *doc_statistics) -> None: + if len(doc_statistics) > 0: + self.doc_statistics = doc_statistics[0] + + self.nlp_model = NLPModels() + self.MAX_TITLE_LEVEL = 3 + self.numbered_title_pattern = r""" + ^ # 行首 + ( # 开始捕获组 + [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1) + |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2) + |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A) + |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A) + |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I) + |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I) + |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1 + |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、 + |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一) + |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1 + |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a) + |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a) + |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A- + |\w+:\s # 英文序号词后跟冒号和空格,例如:First: + |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格 + |[IVXLCDM]+\. # 罗马数字后跟点,例如:I. + |\d+\.\s # 单个数字后跟点和空格,例如:1. + ) # 结束捕获组 + .+ # 标题的其余部分 + """ + + def _is_potential_title( + self, + curr_line, + prev_line, + prev_line_is_title, + next_line, + avg_char_width, + avg_char_height, + median_font_size, + ): + """ + This function checks if the line is a potential title. + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the line is a potential title, False otherwise. + """ + + def __is_line_centered(line_bbox, page_bbox, avg_char_width): + """ + This function checks if the line is centered on the page + + Parameters + ---------- + line_bbox : list + bbox of the line + page_bbox : list + bbox of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is centered on the page, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, x1, _ = line_bbox + _, _, page_x1, _ = page_bbox + + return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres + + def __is_bold_font_line(line): + """ + Check if a line contains any bold font style. + """ + + def _is_bold_span(span): + # if span text is empty or only contains space, return False + if not span["text"].strip(): + return False + + return bool(span["flags"] & 2**4) # Check if the font is bold + + for span in line["spans"]: + if not _is_bold_span(span): + return False + + return True + + def __is_italic_font_line(line): + """ + Check if a line contains any italic font style. + """ + + def __is_italic_span(span): + return bool(span["flags"] & 2**1) # Check if the font is italic + + for span in line["spans"]: + if not __is_italic_span(span): + return False + + return True + + def __is_punctuation_heavy(line_text): + """ + Check if the line contains a high ratio of punctuation marks, which may indicate + that the line is not a title. + + Parameters: + line_text (str): Text of the line. + + Returns: + bool: True if the line is heavy with punctuation, False otherwise. + """ + # Pattern for common title format like "X.Y. 
Title" + pattern = r"\b\d+\.\d+\..*\b" + + # If the line matches the title format, return False + if re.match(pattern, line_text.strip()): + return False + + # Find all punctuation marks in the line + punctuation_marks = re.findall(r"[^\w\s]", line_text) + number_of_punctuation_marks = len(punctuation_marks) + + text_length = len(line_text) + + if text_length == 0: + return False + + punctuation_ratio = number_of_punctuation_marks / text_length + if punctuation_ratio >= 0.1: + return True + + return False + + def __has_mixed_font_styles(spans, strict_mode=False): + """ + This function checks if the line has mixed font styles, the strict mode will compare the font types + + Parameters + ---------- + spans : list + spans of the line + strict_mode : bool + True for strict mode, the font types will be fully compared + False for non-strict mode, the font types will be compared by the most longest common prefix + + Returns + ------- + bool + True if the line has mixed font styles, False otherwise. 
+ """ + if strict_mode: + font_styles = set() + for span in spans: + font_style = span["font"].lower() + font_styles.add(font_style) + + return len(font_styles) > 1 + + else: # non-strict mode + font_styles = [] + for span in spans: + font_style = span["font"].lower() + font_styles.append(font_style) + + if len(font_styles) > 1: + longest_common_prefix = os.path.commonprefix(font_styles) + if len(longest_common_prefix) > 0: + return False + else: + return True + else: + return False + + def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type): + """ + This function checks if the current line has a different font type from the previous and next lines + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + prev_line_font_type : str + font type of the previous line + next_line_font_type : str + font type of the next line + + Returns + ------- + bool + True if the current line has a different font type from the previous and next lines, False otherwise. + """ + return all( + curr_line_font_type != other_font_type.lower() + for other_font_type in [prev_line_font_type, next_line_font_type] + if other_font_type is not None + ) + + def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size): + """ + This function checks if the current line has a larger font size than the previous and next lines + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + prev_line_font_size : float + font size of the previous line + next_line_font_size : float + font size of the next line + + Returns + ------- + bool + True if the current line has a larger font size than the previous and next lines, False otherwise. 
+ """ + return all( + curr_line_font_size > other_font_size * 1.2 + for other_font_size in [prev_line_font_size, next_line_font_size] + if other_font_size is not None + ) + + def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size): + """ + This function checks if the current line is similar to the previous line + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + + Returns + ------- + bool + True if the current line is similar to the previous line, False otherwise. + """ + + if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size: + return True + else: + return False + + def __is_same_font_type_of_docAvg(curr_line_font_type): + """ + This function checks if the current line has the same font type as the document average font type + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + + Returns + ------- + bool + True if the current line has the same font type as the document average font type, False otherwise. + """ + doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower() + doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower() + + return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type] + + def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1): + """ + This function checks if the current line has a large enough font size + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + ratio : float + ratio of the current line font size to the document average font size + + Returns + ------- + bool + True if the current line has a large enough font size, False otherwise. 
+ """ + doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0) + doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0) + doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size) + + return curr_line_font_size >= doc_avg_font_size * ratio + + def __is_sufficient_spacing_above_and_below( + curr_line_bbox, + prev_line_bbox, + next_line_bbox, + avg_char_height, + median_font_size, + ): + """ + This function checks if the current line has sufficient spacing above and below + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the current line has sufficient spacing above and below, False otherwise. + """ + vertical_ratio = 1.25 + vertical_thres = vertical_ratio * median_font_size + + _, y0, _, y1 = curr_line_bbox + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3]) + sufficient_spacing_above = vertical_spacing_above > vertical_thres + else: + sufficient_spacing_above = True + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1) + sufficient_spacing_below = vertical_spacing_below > vertical_thres + else: + sufficient_spacing_below = True + + return (sufficient_spacing_above, sufficient_spacing_below) + + def __is_word_list_line_by_rules(curr_line_text): + """ + This function checks if the current line is a word list + + Parameters + ---------- + curr_line_text : str + text of the current line + + Returns + ------- + bool + True if the current line is a name list, False otherwise. 
+ """ + # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)" + name_list_pattern = r"(?= 0.9: + return True + + return False + + def __is_equation(line_text): + """ + This function checks if the current line is an equation. + + Parameters + ---------- + line_text : str + + Returns + ------- + bool + True if the current line is an equation, False otherwise. + """ + equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations + + if re.search(equation_reg, line_text): + return True + else: + return False + + def __is_title_by_len(text, max_length=200): + """ + This function checks if the current line is a title by length. + + Parameters + ---------- + text : str + text of the current line + + max_length : int + max length of the title + + Returns + ------- + bool + True if the current line is a title, False otherwise. + + """ + text = text.strip() + return len(text) <= max_length + + def __compute_line_font_type_and_size(curr_line): + """ + This function computes the font type and font size of the line. + + Parameters + ---------- + line : dict + line + + Returns + ------- + font_type : str + font type of the line + font_size : float + font size of the line + """ + spans = curr_line["spans"] + max_accumulated_length = 0 + max_span_font_size = curr_line["spans"][0]["size"] # default value, float type + max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type + for span in spans: + if span["text"].isspace(): + continue + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_accumulated_length: + max_accumulated_length = span_length + max_span_font_size = span["size"] + max_span_font_type = span["font"].lower() + + return max_span_font_type, max_span_font_size + + def __is_a_consistent_sub_title(pre_line, curr_line): + """ + This function checks if the current line is a consistent sub title. 
+ + Parameters + ---------- + pre_line : dict + previous line + curr_line : dict + current line + + Returns + ------- + bool + True if the current line is a consistent sub title, False otherwise. + """ + if pre_line is None: + return False + + start_letter_of_pre_line = pre_line["text"][0] + start_letter_of_curr_line = curr_line["text"][0] + + has_same_prefix_digit = ( + start_letter_of_pre_line.isdigit() + and start_letter_of_curr_line.isdigit() + and start_letter_of_pre_line == start_letter_of_curr_line + ) + + # prefix text of curr_line satisfies the following title format: x.x + prefix_text_pattern = r"^\d+\.\d+" + has_subtitle_format = re.match(prefix_text_pattern, curr_line["text"]) + + if has_same_prefix_digit or has_subtitle_format: + return True + + """ + Title detecting main Process. + """ + + """ + Basic features about the current line. + """ + curr_line_bbox = curr_line["bbox"] + curr_line_text = curr_line["text"] + curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line) + + if len(curr_line_text.strip()) == 0: # skip empty lines + return False, False + + prev_line_bbox = prev_line["bbox"] if prev_line else None + if prev_line: + prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line) + else: + prev_line_font_type, prev_line_font_size = None, None + + next_line_bbox = next_line["bbox"] if next_line else None + if next_line: + next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line) + else: + next_line_font_type, next_line_font_size = None, None + + """ + Aggregated features about the current line. 
+ """ + is_italc_font = __is_italic_font_line(curr_line) + is_bold_font = __is_bold_font_line(curr_line) + + is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8) + is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1) + is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6) + + is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type) + + is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg + + is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True) + is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False) + + is_punctuation_heavy = __is_punctuation_heavy(curr_line_text) + + is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text) + is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"] + + is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors( + curr_line_font_size, prev_line_font_size, next_line_font_size + ) + + is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors( + curr_line_font_type, prev_line_font_type, next_line_font_type + ) + + has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size + ) + + is_similar_to_pre_line = __is_similar_to_pre_line( + curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size + ) + + is_consis_sub_title = __is_a_consistent_sub_title(prev_line, curr_line) + + """ + Further aggregated features about the current line. + + Attention: + Features that start with __ are for internal use. 
+ """ + + __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width + ) + __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors + is_a_left_inline_title = ( + is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors + ) + + is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font + is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font + + is_title_by_check_pre_and_next_line = ( + (prev_line is not None or next_line is not None) + and has_sufficient_spaces_above + and has_sufficient_spaces_below + and is_potential_title_font + ) + + is_numbered_title = __is_numbered_title(curr_line_text) and ( + (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None) + ) + + is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text) + + is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text) + + is_equation = __is_equation(curr_line_text) + + is_title_by_len = __is_title_by_len(curr_line_text) + + """ + Decide if the line is a title. 
+ """ + + is_title = ( + is_not_end_with_ending_puncs # not end with ending punctuation marks + and is_not_only_no_meaning_symbols # not only have no meaning symbols + and is_title_by_len # is a title by length, default max length is 200 + and not is_equation # an interline equation should never be a title + and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type + and ( + (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + or ( + is_much_larger_font_than_doc_avg + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_font_size_little_less_than_doc_avg + and is_bold_font + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + ) # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below + and ( + ( + not is_person_or_org_list_line_by_nlp + and ( + is_much_larger_font_than_doc_avg + or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + ) + ) + or ( + not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp) + and not is_a_left_inline_title + and not is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_person_or_org_list_line_by_nlp + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + ) + or (is_numbered_title and not is_a_left_inline_title) + ) # Exclude the following situations: person/org list + ) + # ) or (prev_line_is_title and is_consis_sub_title) + + 
is_name_or_org_list_to_be_removed = ( + (is_person_or_org_list_line_by_nlp) + and is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) and not is_title + + if is_name_or_org_list_to_be_removed: + is_author_or_org_list = True + else: + is_author_or_org_list = False + + # return is_title, is_author_or_org_list + + """ + # print reason why the line is a title + if is_title: + print_green("This line is a title.") + print_green("↓" * 10) + print() + print("curr_line_text: ", curr_line_text) + print() + + # print reason why the line is not a title + line_text = curr_line_text.strip() + test_text = "Career/Personal Life" + text_content_condition = line_text == test_text + + if not is_title and text_content_condition: # Print specific line + # if not is_title: # Print each line + print_red("This line is not a title.") + print_red("↓" * 10) + + print() + print("curr_line_text: ", curr_line_text) + print() + + if is_not_end_with_ending_puncs: + print_green(f"is_not_end_with_ending_puncs") + else: + print_red(f"is_end_with_ending_puncs") + + if is_not_only_no_meaning_symbols: + print_green(f"is_not_only_no_meaning_symbols") + else: + print_red(f"is_only_no_meaning_symbols") + + if is_title_by_len: + print_green(f"is_title_by_len: {is_title_by_len}") + else: + print_red(f"is_not_title_by_len: {is_title_by_len}") + + if is_equation: + print_red(f"is_equation") + else: + print_green(f"is_not_equation") + + if is_potential_title_font: + print_green(f"is_potential_title_font") + else: + print_red(f"is_not_potential_title_font") + + if is_punctuation_heavy: + print_red("is_punctuation_heavy") + else: + print_green("is_not_punctuation_heavy") + + if is_bold_font: + print_green(f"is_bold_font") + else: + print_red(f"is_not_bold_font") + + if is_font_size_not_less_than_doc_avg: + print_green(f"is_larger_font_than_doc_avg") + else: + print_red(f"is_not_larger_font_than_doc_avg") + + if 
is_much_larger_font_than_doc_avg: + print_green(f"is_much_larger_font_than_doc_avg") + else: + print_red(f"is_not_much_larger_font_than_doc_avg") + + if is_not_same_font_type_of_docAvg: + print_green(f"is_not_same_font_type_of_docAvg") + else: + print_red(f"is_same_font_type_of_docAvg") + + if is_word_list_line_by_rules: + print_red("is_word_list_line_by_rules") + else: + print_green("is_not_name_list_by_rules") + + if is_person_or_org_list_line_by_nlp: + print_red("is_person_or_org_list_line_by_nlp") + else: + print_green("is_not_person_or_org_list_line_by_nlp") + + if not is_numbered_title: + print_red("is_not_numbered_title") + else: + print_green("is_numbered_title") + + if is_a_left_inline_title: + print_red("is_a_left_inline_title") + else: + print_green("is_not_a_left_inline_title") + + if not is_title_by_check_prev_line: + print_red("is_not_title_by_check_prev_line") + else: + print_green("is_title_by_check_prev_line") + + if not is_title_by_check_next_line: + print_red("is_not_title_by_check_next_line") + else: + print_green("is_title_by_check_next_line") + + if not is_title_by_check_pre_and_next_line: + print_red("is_not_title_by_check_pre_and_next_line") + else: + print_green("is_title_by_check_pre_and_next_line") + + # print_green("Common features:") + # print_green("↓" * 10) + + # print(f" curr_line_font_type: {curr_line_font_type}") + # print(f" curr_line_font_size: {curr_line_font_size}") + # print() + + """ + + return is_title, is_author_or_org_list + + def _detect_title(self, input_block): + """ + Use the functions 'is_potential_title' to detect titles of each paragraph block. + If a line is a title, then the value of key 'is_title' of the line will be set to True. 
+ """ + + raw_lines = input_block["lines"] + + prev_line_is_title_flag = False + + for i, curr_line in enumerate(raw_lines): + prev_line = raw_lines[i - 1] if i > 0 else None + next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None + + blk_avg_char_width = input_block["avg_char_width"] + blk_avg_char_height = input_block["avg_char_height"] + blk_media_font_size = input_block["median_font_size"] + + is_title, is_author_or_org_list = self._is_potential_title( + curr_line, + prev_line, + prev_line_is_title_flag, + next_line, + blk_avg_char_width, + blk_avg_char_height, + blk_media_font_size, + ) + + if is_title: + curr_line["is_title"] = is_title + prev_line_is_title_flag = True + else: + curr_line["is_title"] = False + prev_line_is_title_flag = False + + # print(f"curr_line['text']: {curr_line['text']}") + # print(f"curr_line['is_title']: {curr_line['is_title']}") + # print(f"prev_line['text']: {prev_line['text'] if prev_line else None}") + # print(f"prev_line_is_title_flag: {prev_line_is_title_flag}") + # print() + + if is_author_or_org_list: + curr_line["is_author_or_org_list"] = is_author_or_org_list + else: + curr_line["is_author_or_org_list"] = False + + return input_block + + def batch_detect_titles(self, pdf_dic): + """ + This function batch process the blocks to detect titles. + + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + num_titles = 0 + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + para_blocks = blocks["para_blocks"] + + all_single_line_blocks = [] + for block in para_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + new_para_blocks = [] + if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks. 
+ for para_block in para_blocks: + new_block = self._detect_title(para_block) + new_para_blocks.append(new_block) + num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]]) + else: # All blocks are single line blocks. + for para_block in para_blocks: + new_para_blocks.append(para_block) + num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]]) + para_blocks = new_para_blocks + + blocks["para_blocks"] = para_blocks + + for para_block in para_blocks: + all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"]) + para_text_len = sum([len(line["text"]) for line in para_block["lines"]]) + if ( + all_titles and para_text_len < 200 + ): # total length of the paragraph is less than 200, more than this should not be a title + para_block["is_block_title"] = 1 + else: + para_block["is_block_title"] = 0 + + all_name_or_org_list_to_be_removed = all( + safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"] + ) + if all_name_or_org_list_to_be_removed and page_id == "page_0": + para_block["is_block_an_author_or_org_list"] = 1 + else: + para_block["is_block_an_author_or_org_list"] = 0 + + pdf_dic["statistics"]["num_titles"] = num_titles + + return pdf_dic + + def _recog_title_level(self, title_blocks): + """ + This function determines the title level based on the font size of the title. 
+ + Parameters + ---------- + title_blocks : list + + Returns + ------- + title_blocks : list + """ + + font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks]) + + # Use the mean and std of font sizes to remove extreme values + mean_font_size = np.mean(font_sizes) + std_font_size = np.std(font_sizes) + min_extreme_font_size = mean_font_size - std_font_size # type: ignore + max_extreme_font_size = mean_font_size + std_font_size # type: ignore + + # Compute the threshold for title level + middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)] + if middle_font_sizes.size > 0: + middle_mean_font_size = np.mean(middle_font_sizes) + level_threshold = middle_mean_font_size + else: + level_threshold = mean_font_size + + for tb in title_blocks: + title_block = tb["block"] + title_font_size = safe_get(title_block, "block_font_size", 0) + + current_level = 1 # Initialize title level, the biggest level is 1 + + # print(f"Before adjustment by font size, {current_level}") + if title_font_size >= max_extreme_font_size: + current_level = 1 + elif title_font_size <= min_extreme_font_size: + current_level = 3 + elif float(title_font_size) >= float(level_threshold): + current_level = 2 + else: + current_level = 3 + # print(f"After adjustment by font size, {current_level}") + + title_block["block_title_level"] = current_level + + return title_blocks + + def batch_recog_title_level(self, pdf_dic): + """ + This function batch process the blocks to recognize title level. 
+ + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + title_blocks = [] + + # Collect all titles + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = blocks.get("para_blocks", []) + for block in para_blocks: + if block.get("is_block_title"): + title_obj = {"page_id": page_id, "block": block} + title_blocks.append(title_obj) + + # Determine title level + if title_blocks: + # Determine title level based on font size + title_blocks = self._recog_title_level(title_blocks) + + return pdf_dic + + +class BlockTerminationProcessor: + """ + This class is used to process the block termination. + """ + + def __init__(self) -> None: + pass + + def _is_consistent_lines( + self, + curr_line, + prev_line, + next_line, + consistent_direction, # 0 for prev, 1 for next, 2 for both + ): + """ + This function checks if the line is consistent with its neighbors + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + consistent_direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is consistent with its neighbors, False otherwise. 
+ """ + + curr_line_font_size = curr_line["spans"][0]["size"] + curr_line_font_type = curr_line["spans"][0]["font"].lower() + + if consistent_direction == 0: + if prev_line: + prev_line_font_size = prev_line["spans"][0]["size"] + prev_line_font_type = prev_line["spans"][0]["font"].lower() + return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type + else: + return False + + elif consistent_direction == 1: + if next_line: + next_line_font_size = next_line["spans"][0]["size"] + next_line_font_type = next_line["spans"][0]["font"].lower() + return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type + else: + return False + + elif consistent_direction == 2: + if prev_line and next_line: + prev_line_font_size = prev_line["spans"][0]["size"] + prev_line_font_type = prev_line["spans"][0]["font"].lower() + next_line_font_size = next_line["spans"][0]["size"] + next_line_font_type = next_line["spans"][0]["font"].lower() + return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and ( + curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type + ) + else: + return False + + else: + return False + + def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height): + """ + This function checks if the line is a regular line + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_line_height : float + average of line heights + + Returns + ------- + bool + True if the line is a regular line, False otherwise. 
+ """ + horizontal_ratio = 0.5 + vertical_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + vertical_thres = vertical_ratio * avg_line_height + + x0, y0, x1, y1 = curr_line_bbox + + x0_near_X0 = abs(x0 - X0) < horizontal_thres + x1_near_X1 = abs(x1 - X1) < horizontal_thres + + prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width) + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = y1 - prev_line_bbox[3] + sufficient_spacing_above = vertical_spacing_above > vertical_thres + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = next_line_bbox[1] - y0 + sufficient_spacing_below = vertical_spacing_below > vertical_thres + + return ( + (sufficient_spacing_above or sufficient_spacing_below) + or (not x0_near_X0 and not x1_near_X1) + or prev_line_is_end_of_para + ) + + def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size): + """ + This function checks if the line is a possible start of a paragraph + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_char_width : float + average of char widths + avg_line_height : float + average of line heights + + Returns + ------- + bool + True if the line is a possible start of a paragraph, False otherwise. 
+ """ + start_confidence = 0.5 # Initial confidence of the line being a start of a paragraph + decision_path = [] # Record the decision path + + curr_line_bbox = curr_line["bbox"] + prev_line_bbox = prev_line["bbox"] if prev_line else None + next_line_bbox = next_line["bbox"] if next_line else None + + indent_ratio = 1 + + vertical_ratio = 1.5 + vertical_thres = vertical_ratio * avg_font_size + + left_horizontal_ratio = 0.5 + left_horizontal_thres = left_horizontal_ratio * avg_char_width + + right_horizontal_ratio = 2.5 + right_horizontal_thres = right_horizontal_ratio * avg_char_width + + x0, y0, x1, y1 = curr_line_bbox + + indent_condition = x0 > X0 + indent_ratio * avg_char_width + if indent_condition: + start_confidence += 0.2 + decision_path.append("indent_condition_met") + + x0_near_X0 = abs(x0 - X0) < left_horizontal_thres + if x0_near_X0: + start_confidence += 0.1 + decision_path.append("x0_near_X0") + + x1_near_X1 = abs(x1 - X1) < right_horizontal_thres + if x1_near_X1: + start_confidence += 0.1 + decision_path.append("x1_near_X1") + + if prev_line is None: + prev_line_is_end_of_para = True + start_confidence += 0.2 + decision_path.append("no_prev_line") + else: + prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width) + if prev_line_is_end_of_para: + start_confidence += 0.1 + decision_path.append("prev_line_is_end_of_para") + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = y1 - prev_line_bbox[3] + sufficient_spacing_above = vertical_spacing_above > vertical_thres + if sufficient_spacing_above: + start_confidence += 0.2 + decision_path.append("sufficient_spacing_above") + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = next_line_bbox[1] - y0 + sufficient_spacing_below = vertical_spacing_below > vertical_thres + if sufficient_spacing_below: + start_confidence += 0.2 + decision_path.append("sufficient_spacing_below") + + is_regular_line 
= self._is_regular_line( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size + ) + if is_regular_line: + start_confidence += 0.1 + decision_path.append("is_regular_line") + + is_start_of_para = ( + (sufficient_spacing_above or sufficient_spacing_below) + or (indent_condition) + or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line) + or prev_line_is_end_of_para + ) + return (is_start_of_para, start_confidence, decision_path) + + def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width): + """ + This function checks if the line is a possible end of a paragraph + + Parameters + ---------- + curr_line : dict + current line + next_line : dict + next line + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is a possible end of a paragraph, False otherwise. 
+ """ + + end_confidence = 0.5 # Initial confidence of the line being a end of a paragraph + decision_path = [] # Record the decision path + + curr_line_bbox = curr_line["bbox"] + next_line_bbox = next_line["bbox"] if next_line else None + + left_horizontal_ratio = 0.5 + right_horizontal_ratio = 0.5 + + x0, _, x1, y1 = curr_line_bbox + next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width + if x0_near_X0: + end_confidence += 0.1 + decision_path.append("x0_near_X0") + + x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width + if x1_smaller_than_X1: + end_confidence += 0.1 + decision_path.append("x1_smaller_than_X1") + + next_line_is_start_of_para = ( + next_line_bbox + and (next_x0 > X0 + left_horizontal_ratio * avg_char_width) + and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1)) + ) + if next_line_is_start_of_para: + end_confidence += 0.2 + decision_path.append("next_line_is_start_of_para") + + is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors( + curr_line_bbox, None, next_line_bbox, avg_char_width + ) + if is_line_left_aligned_from_neighbors_bool: + end_confidence += 0.1 + decision_path.append("line_is_left_aligned_from_neighbors") + + is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors( + curr_line_bbox, None, next_line_bbox, avg_char_width + ) + if not is_line_right_aligned_from_neighbors_bool: + end_confidence += 0.1 + decision_path.append("line_is_not_right_aligned_from_neighbors") + + is_end_of_para = end_with_punctuation(curr_line["text"]) and ( + (x0_near_X0 and x1_smaller_than_X1) + or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool) + ) + + return (is_end_of_para, end_confidence, decision_path) + + def _cut_paras_per_block( + self, + block, + ): + """ + Processes a raw block from PyMuPDF 
and returns the processed block. + + Parameters + ---------- + raw_block : dict + A raw block from pymupdf. + + Returns + ------- + processed_block : dict + + """ + + def _construct_para(lines, is_block_title, para_title_level): + """ + Construct a paragraph from given lines. + """ + + font_sizes = [span["size"] for line in lines for span in line["spans"]] + avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0 + + font_colors = [span["color"] for line in lines for span in line["spans"]] + most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None + + font_type_lengths = {} + for line in lines: + for span in line["spans"]: + font_type = span["font"] + bbox_width = span["bbox"][2] - span["bbox"][0] + if font_type in font_type_lengths: + font_type_lengths[font_type] += bbox_width + else: + font_type_lengths[font_type] = bbox_width + + # get the font type with the longest bbox width + most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None # type: ignore + + para_bbox = calculate_para_bbox(lines) + para_text = " ".join(line["text"] for line in lines) + + return { + "para_bbox": para_bbox, + "para_text": para_text, + "para_font_type": most_common_font_type, + "para_font_size": avg_font_size, + "para_font_color": most_common_font_color, + "is_para_title": is_block_title, + "para_title_level": para_title_level, + } + + block_bbox = block["bbox"] + block_text = block["text"] + block_lines = block["lines"] + + X0 = safe_get(block, "X0", 0) + X1 = safe_get(block, "X1", 0) + avg_char_width = safe_get(block, "avg_char_width", 0) + avg_char_height = safe_get(block, "avg_char_height", 0) + avg_font_size = safe_get(block, "avg_font_size", 0) + + is_block_title = safe_get(block, "is_block_title", False) + para_title_level = safe_get(block, "block_title_level", 0) + + # Segment into paragraphs + para_ranges = [] + in_paragraph = False + start_idx_of_para = None + + # Create the 
processed paragraphs + processed_paras = {} + para_bboxes = [] + end_idx_of_para = 0 + + for line_index, line in enumerate(block_lines): + curr_line = line + prev_line = block_lines[line_index - 1] if line_index > 0 else None + next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None + + """ + Start processing paragraphs. + """ + + # Check if the line is the start of a paragraph + is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para( + curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size + ) + if not in_paragraph and is_start_of_para: + in_paragraph = True + start_idx_of_para = line_index + + # print_green(">>> Start of a paragraph") + # print(" curr_line_text: ", curr_line["text"]) + # print(" start_confidence: ", start_confidence) + # print(" decision_path: ", decision_path) + + # Check if the line is the end of a paragraph + is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para( + curr_line, next_line, X0, X1, avg_char_width + ) + if in_paragraph and (is_end_of_para or not next_line): + para_ranges.append((start_idx_of_para, line_index)) + start_idx_of_para = None + in_paragraph = False + + # print_red(">>> End of a paragraph") + # print(" curr_line_text: ", curr_line["text"]) + # print(" end_confidence: ", end_confidence) + # print(" decision_path: ", decision_path) + + # Add the last paragraph if it is not added + if in_paragraph and start_idx_of_para is not None: + para_ranges.append((start_idx_of_para, len(block_lines) - 1)) + + # Process the matched paragraphs + for para_index, (start_idx, end_idx) in enumerate(para_ranges): + matched_lines = block_lines[start_idx : end_idx + 1] + para_properties = _construct_para(matched_lines, is_block_title, para_title_level) + para_key = f"para_{len(processed_paras)}" + processed_paras[para_key] = para_properties + para_bboxes.append(para_properties["para_bbox"]) + end_idx_of_para = end_idx + 1 + + # Deal with the 
remaining lines + if end_idx_of_para < len(block_lines): + unmatched_lines = block_lines[end_idx_of_para:] + unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level) + unmatched_key = f"para_{len(processed_paras)}" + processed_paras[unmatched_key] = unmatched_properties + para_bboxes.append(unmatched_properties["para_bbox"]) + + block["paras"] = processed_paras + + return block + + def batch_process_blocks(self, pdf_dict): + """ + Parses the blocks of all pages. + + Parameters + ---------- + pdf_dict : dict + PDF dictionary. + filter_blocks : list + List of bounding boxes to filter. + + Returns + ------- + result_dict : dict + Result dictionary. + + """ + + num_paras = 0 + + for page_id, page in pdf_dict.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in page.keys(): + input_blocks = page["para_blocks"] + for input_block in input_blocks: + new_block = self._cut_paras_per_block(input_block) + para_blocks.append(new_block) + num_paras += len(new_block["paras"]) + + page["para_blocks"] = para_blocks + + pdf_dict["statistics"]["num_paras"] = num_paras + return pdf_dict + + +class BlockContinuationProcessor: + """ + This class is used to process the blocks to detect block continuations. + """ + + def __init__(self) -> None: + pass + + def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3): + """ + This function checks if the two font types are similar. + Definition of similar font types: the two font types have a common prefix, + and the length of the common prefix is at least a certain ratio of the length of the shorter font type. + + Parameters + ---------- + font_type1 : str + font type 1 + font_type2 : str + font type 2 + prefix_length_ratio : float + minimum ratio of the common prefix length to the length of the shorter font type + + Returns + ------- + bool + True if the two font types are similar, False otherwise. 
+ """ + + if isinstance(font_type_1, list): + font_type_1 = font_type_1[0] if font_type_1 else "" + if isinstance(font_type_2, list): + font_type_2 = font_type_2[0] if font_type_2 else "" + + if font_type_1 == font_type_2: + return True + + # Find the length of the common prefix + common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2])) + + # Calculate the minimum prefix length based on the ratio + min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio) + + return common_prefix_length >= min_prefix_length + + def __is_same_block_font(self, block_1, block_2): + """ + This function compares the font of block1 and block2 + + Parameters + ---------- + block1 : dict + block1 + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 have the same font, else False + """ + block_1_font_type = safe_get(block_1, "block_font_type", "") + block_1_font_size = safe_get(block_1, "block_font_size", 0) + block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0) + + block_2_font_type = safe_get(block_2, "block_font_type", "") + block_2_font_size = safe_get(block_2, "block_font_size", 0) + block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0) + + if isinstance(block_1_font_size, list): + block_1_font_size = block_1_font_size[0] if block_1_font_size else 0 + if isinstance(block_2_font_size, list): + block_2_font_size = block_2_font_size[0] if block_2_font_size else 0 + + block_1_text = safe_get(block_1, "text", "") + block_2_text = safe_get(block_2, "text", "") + + if block_1_avg_char_width == 0 or block_2_avg_char_width == 0: + return False + + if not block_1_text or not block_2_text: + return False + else: + text_len_ratio = len(block_2_text) / len(block_1_text) + if text_len_ratio < 0.2: + avg_char_width_condition = ( + abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.5 + ) + else: + avg_char_width_condition = ( + 
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.2 + ) + + block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1 + + return ( + self.__is_similar_font_type(block_1_font_type, block_2_font_type) + and avg_char_width_condition + and block_font_size_condition + ) + + def _is_alphabet_char(self, char): + if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"): + return True + else: + return False + + def _is_chinese_char(self, char): + if char >= "\u4e00" and char <= "\u9fa5": + return True + else: + return False + + def _is_other_letter_char(self, char): + try: + cat = unicodedata.category(char) + if cat == "Lu" or cat == "Ll": + return not self._is_alphabet_char(char) and not self._is_chinese_char(char) + except TypeError: + print("The input to the function must be a single character.") + return False + + def _is_year(self, s: str): + try: + number = int(s) + return 1900 <= number <= 2099 + except ValueError: + return False + + def _match_brackets(self, text): + # pattern = r"^[\(\)\[\]()【】{}{}<><>〔〕〘〙\"\'“”‘’]" + pattern = r"^[\(\)\]()】{}{}>>〕〙\"\'“”‘’]" + return bool(re.match(pattern, text)) + + def _is_para_font_consistent(self, para_1, para_2): + """ + This function compares the font of para1 and para2 + + Parameters + ---------- + para1 : dict + para1 + para2 : dict + para2 + + Returns + ------- + is_same : bool + True if para1 and para2 have the same font, else False + """ + if para_1 is None or para_2 is None: + return False + + para_1_font_type = safe_get(para_1, "para_font_type", "") + para_1_font_size = safe_get(para_1, "para_font_size", 0) + para_1_font_color = safe_get(para_1, "para_font_color", "") + + para_2_font_type = safe_get(para_2, "para_font_type", "") + para_2_font_size = safe_get(para_2, "para_font_size", 0) + para_2_font_color = safe_get(para_2, "para_font_color", "") + + if isinstance(para_1_font_type, list): # get the most common 
    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph by using the puncs, else False
        """
        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()

        para_1_bboxes = safe_get(para_1, "para_bbox", [])
        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)

        para_2_bboxes = safe_get(para_2, "para_bbox", [])
        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)

        # For multi-bbox paragraphs, compare para_1's LAST line against
        # para_2's FIRST line (the two lines that would meet at the join).
        if is_nested_list(para_1_bboxes):
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
        else:
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes

        if is_nested_list(para_2_bboxes):
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
        else:
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes

        # Alignment tolerances scale with the mean of the two font sizes
        # (0.5 * (s1 + s2) is the mean; the extra 0.8 tightens it).
        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold

        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold

        # Check if either para_text1 or para_text2 is empty
        if not para_1_text or not para_2_text:
            return False

        # Define the end puncs for a sentence to end and hyphen
        end_puncs = [".", "?", "!", "。", "?", "!", "…"]
        hyphen = ["-", "—"]

        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs

        # NOTE(review): the `para_1_end_with_space` branch below is dead code:
        # para_1_text is .strip()ped so it cannot end with " ", and even if it
        # could, the `para_1_not_end_with_end_punc` branch above it catches
        # every text not ending in an end punc first.

        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
            # Hyphenated line break: para_2 must continue with a hyphen or a
            # lowercase/CJK/other letter.
            para_2_is_consistent = para_2_text and (
                para_2_text[0] in hyphen
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
            # Sentence already closed: only merge if para_2 starts with a
            # space and is not left-indented relative to para_1.
            para_2_is_consistent = (
                para_2_text
                and (
                    para_2_text[0]
                    == " "
                    # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
                    # or (self._is_chinese_char(para_2_text[0]))
                    # or (self._is_other_letter_char(para_2_text[0]))
                )
                and not is_para2_left_indent_than_papa1
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
            # Sentence still open: merge on any plausible continuation —
            # letters, a year, brackets/quotes, or matching alignment.
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_alphabet_char(para_2_text[0]))
                or (self._is_year(para_2_text[0:4]))
                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
                or (self._match_brackets(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_end_with_space:  # If para_text1 ends with space
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        return False
para_2_is_consistent = ( + para_2_text + and ( + para_2_text[0] + == " " + # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper()) + # or (self._is_chinese_char(para_2_text[0])) + # or (self._is_other_letter_char(para_2_text[0])) + ) + and not is_para2_left_indent_than_papa1 + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + # print(f"para_2 is not consistent.\n") + pass + + elif para_1_not_end_with_end_punc: # If para_text1 is not end with ending punctuations + # print_red(f"para_1 is NOT end with end_punc.") + para_2_is_consistent = para_2_text and ( + para_2_text[0] == " " + or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower()) + or (self._is_alphabet_char(para_2_text[0])) + or (self._is_year(para_2_text[0:4])) + or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2) + or (self._is_chinese_char(para_2_text[0])) + or (self._is_other_letter_char(para_2_text[0])) + or (self._match_brackets(para_2_text[0])) + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + # print(f"para_2 is not consistent.\n") + pass + + elif para_1_end_with_space: # If para_text1 ends with space + # print_red(f"para_1 is end with space.") + para_2_is_consistent = para_2_text and ( + para_2_text[0] == " " + or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower()) + or (self._is_chinese_char(para_2_text[0])) + or (self._is_other_letter_char(para_2_text[0])) + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + pass + # print(f"para_2 is not consistent.\n") + + return False + + def _is_block_consistent(self, block_1, block_2): + """ + This function determines whether block1 and block2 are originally from the same block + + Parameters + ---------- + block1 : dict + block1s + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 are from the same block, else False + """ + return 
def _is_para_continued(self, para_1, para_2):
    """
    Decide whether para_1 and para_2 originally belong to the same paragraph:
    both the font comparison and the punctuation/alignment comparison must agree.

    Parameters
    ----------
    para_1 : dict
        Former paragraph.
    para_2 : dict
        Latter paragraph.

    Returns
    -------
    bool
        True if the paragraphs continue each other, else False.
    """
    return self._is_para_font_consistent(para_1, para_2) and self._is_para_puncs_consistent(para_1, para_2)

def _are_boundaries_of_block_consistent(self, block_1, block_2):
    """
    Check whether the last line of block_1 and the first line of block_2 look
    typographically consistent (similar font type, size within 1pt, same flags).

    Parameters
    ----------
    block_1 : dict
        Former block (its last line is inspected).
    block_2 : dict
        Latter block (its first line is inspected).

    Returns
    -------
    bool
        True if the facing boundary lines are consistent, else False.
    """
    last_line_of_block_1 = block_1["lines"][-1]
    first_line_of_block_2 = block_2["lines"][0]

    # Only the first span of each boundary line is sampled.
    span_1 = last_line_of_block_1["spans"][0]
    span_2 = first_line_of_block_2["spans"][0]

    # Font color is intentionally NOT compared (it was disabled in the
    # original implementation as well).
    return (
        self.__is_similar_font_type(span_1["font"].lower(), span_2["font"].lower())
        and abs(span_1["size"] - span_2["size"]) < 1
        and span_1["flags"] == span_2["flags"]
    )

def should_merge_next_para(self, curr_para, next_para):
    """
    Check whether *next_para* should be merged into *curr_para*.

    Parameters
    ----------
    curr_para : dict
        The current paragraph.
    next_para : dict
        The next paragraph.

    Returns
    -------
    bool
        True if the next paragraph should be merged into the current one.
    """
    # Simplified from an `if …: return True else: return False` wrapper.
    return self._is_para_continued(curr_para, next_para)

def batch_tag_paras(self, pdf_dict):
    """
    Tag every paragraph in *pdf_dict* with its own location and, where a
    continuation is detected, the location of the paragraph to merge with.

    For each paragraph this sets:
    - "curr_para_location": [page_idx, block_id, para_idx]
    - "next_para_location": location of the continuation paragraph, or None
    - "merge_next_para": True when the following paragraph continues this one

    Parameters
    ----------
    pdf_dict : dict
        PDF dictionary keyed by "page_N".

    Returns
    -------
    dict
        The same dictionary with tagged paragraphs.
    """
    # NOTE(review): this assumes pdf_dict holds only page entries; a
    # non-page key (e.g. "statistics") would make this index off by one.
    the_last_page_id = len(pdf_dict) - 1

    for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
        if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
            para_blocks_of_curr_page = curr_page_content["para_blocks"]
            next_page_idx = curr_page_idx + 1
            next_page_id = f"page_{next_page_idx}"
            next_page_content = pdf_dict.get(next_page_id, {})

            for i, current_block in enumerate(para_blocks_of_curr_page):
                if not current_block["paras"]:
                    # Robustness: a block without paragraphs has nothing to tag
                    # (the original would IndexError on list(...)[-1] below).
                    continue

                for para_id, curr_para in current_block["paras"].items():
                    curr_para["curr_para_location"] = [
                        curr_page_idx,
                        current_block["block_id"],
                        int(para_id.split("_")[-1]),
                    ]
                    curr_para["next_para_location"] = None  # default: no continuation
                    curr_para["merge_next_para"] = False  # default: do not merge

                next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None

                if next_block:
                    # Compare the last paragraph of this block with the first
                    # paragraph of the next block on the same page.
                    curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                    curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                    next_block_first_para_key = list(next_block["paras"].keys())[0]
                    next_blk_first_para = next_block["paras"][next_block_first_para_key]

                    if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                        curr_blk_last_para["next_para_location"] = [
                            curr_page_idx,
                            next_block["block_id"],
                            int(next_block_first_para_key.split("_")[-1]),
                        ]
                        curr_blk_last_para["merge_next_para"] = True
                else:
                    # Last block on the page: look ahead to the first block of
                    # the next page that actually has para_blocks.
                    curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                    curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                    while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
                        next_page_idx += 1
                        next_page_id = f"page_{next_page_idx}"
                        next_page_content = pdf_dict.get(next_page_id, {})

                    if next_page_content.get("para_blocks", []):
                        next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
                        next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]

                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                            curr_blk_last_para["next_para_location"] = [
                                next_page_idx,
                                next_page_content["para_blocks"][0]["block_id"],
                                int(next_blk_first_para_key.split("_")[-1]),
                            ]
                            curr_blk_last_para["merge_next_para"] = True

    return pdf_dict

def find_block_by_id(self, para_blocks, block_id):
    """
    Find a block by its id.

    Parameters
    ----------
    para_blocks : list
        List of block dicts.
    block_id : int
        Id of the block to find.

    Returns
    -------
    dict or None
        The block with the given id, or None when absent.
    """
    # The original enumerated the list but never used the index.
    for block in para_blocks:
        if block.get("block_id") == block_id:
            return block
    return None

def batch_merge_paras(self, pdf_dict):
    """
    Merge the paragraphs tagged by batch_tag_paras: each paragraph whose
    "merge_next_para" flag is set absorbs the text of its continuation chain;
    absorbed paragraphs have their text emptied.

    Parameters
    ----------
    pdf_dict : dict
        PDF dictionary keyed by "page_N".

    Returns
    -------
    dict
        The same dictionary with merged paragraphs.
    """
    for page_id, page_content in pdf_dict.items():
        if page_id.startswith("page_") and page_content.get("para_blocks", []):
            para_blocks_of_page = page_content["para_blocks"]

            for current_block in para_blocks_of_page:
                paras = current_block["paras"]

                # list() because paragraph dicts are mutated during iteration.
                for para_id, curr_para in list(paras.items()):
                    # Title paragraphs never absorb continuations.
                    if curr_para.get("is_para_title"):
                        continue

                    # Follow the continuation chain until it breaks.
                    while curr_para.get("merge_next_para"):
                        curr_para_location = curr_para.get("curr_para_location")
                        next_para_location = curr_para.get("next_para_location")

                        if not next_para_location:
                            break

                        if curr_para_location == next_para_location:
                            # Self-reference would loop forever; clear the flag.
                            curr_para["merge_next_para"] = False
                            break

                        next_page_idx, next_block_id, next_para_id = next_para_location
                        next_page_id = f"page_{next_page_idx}"
                        next_page_content = pdf_dict.get(next_page_id)
                        if not next_page_content:
                            break

                        next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
                        if not next_block:
                            break

                        next_para = next_block["paras"].get(f"para_{next_para_id}")
                        if not next_para or next_para.get("is_para_title"):
                            break

                        # Merge the texts with a single separating space.
                        curr_para_text = curr_para.get("para_text", "")
                        next_para_text = next_para.get("para_text", "")
                        curr_para["para_text"] = curr_para_text + " " + next_para_text

                        # Advance the chain and mark the absorbed paragraph empty.
                        curr_para["next_para_location"] = next_para.get("next_para_location")
                        next_para["para_text"] = ""
                        curr_para["merge_next_para"] = next_para.get("merge_next_para", False)

    return pdf_dict
+ """ + for page_id, page_content in pdf_dict.items(): + if page_id.startswith("page_") and page_content.get("para_blocks", []): + para_blocks_of_page = page_content["para_blocks"] + + for i in range(len(para_blocks_of_page)): + current_block = para_blocks_of_page[i] + paras = current_block["paras"] + + for para_id, curr_para in list(paras.items()): + # print(f"current para_id: {para_id}") + # 跳过标题段落 + if curr_para.get("is_para_title"): + continue + + while curr_para.get("merge_next_para"): + curr_para_location = curr_para.get("curr_para_location") + next_para_location = curr_para.get("next_para_location") + + # print(f"curr_para_location: {curr_para_location}, next_para_location: {next_para_location}") + + if not next_para_location: + break + + if curr_para_location == next_para_location: + # print_red("The next para is in the same block as the current para.") + curr_para["merge_next_para"] = False + break + + next_page_idx, next_block_id, next_para_id = next_para_location + next_page_id = f"page_{next_page_idx}" + next_page_content = pdf_dict.get(next_page_id) + if not next_page_content: + break + + next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id) + + if not next_block: + break + + next_para = next_block["paras"].get(f"para_{next_para_id}") + + if not next_para or next_para.get("is_para_title"): + break + + # 合并段落文本 + curr_para_text = curr_para.get("para_text", "") + next_para_text = next_para.get("para_text", "") + curr_para["para_text"] = curr_para_text + " " + next_para_text + + # 更新 next_para_location + curr_para["next_para_location"] = next_para.get("next_para_location") + + # 将下一个段落文本置为空,表示已被合并 + next_para["para_text"] = "" + + # 更新 merge_next_para 标记 + curr_para["merge_next_para"] = next_para.get("merge_next_para", False) + + return pdf_dict + + +class DrawAnnos: + """ + This class draws annotations on the pdf file + + ---------------------------------------- + Color Code + ---------------------------------------- 
+ Red: (1, 0, 0) + Green: (0, 1, 0) + Blue: (0, 0, 1) + Yellow: (1, 1, 0) - mix of red and green + Cyan: (0, 1, 1) - mix of green and blue + Magenta: (1, 0, 1) - mix of red and blue + White: (1, 1, 1) - red, green and blue full intensity + Black: (0, 0, 0) - no color component whatsoever + Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components + Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component + """ + + def __init__(self) -> None: + pass + + def __is_nested_list(self, lst): + """ + This function returns True if the given list is a nested list of any degree. + """ + if isinstance(lst, list): + return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst) + return False + + def __valid_rect(self, bbox): + # Ensure that the rectangle is not empty or invalid + if isinstance(bbox[0], list): + return False # It's a nested list, hence it can't be valid rect + else: + return bbox[0] < bbox[2] and bbox[1] < bbox[3] + + def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)): + """ + This function draws the nested boxes + + Parameters + ---------- + page : fitz.Page + page + nested_bbox : list + nested bbox + color : tuple + color, by default (0, 1, 1) # draw with cyan color for combined paragraph + """ + if self.__is_nested_list(nested_bbox): # If it's a nested list + for bbox in nested_bbox: + self.__draw_nested_boxes(page, bbox, color) # Recursively call the function + elif self.__valid_rect(nested_bbox): # If valid rectangle + para_rect = fitz.Rect(nested_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph + para_anno.set_border(width=1) + para_anno.update() + + def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path): + """ + This function draws annotations on the pdf file. 
+ + Parameters + ---------- + input_pdf_path : str + path to the input pdf file + pdf_dic : dict + pdf dictionary + output_pdf_path : str + path to the output pdf file + + pdf_dic : dict + pdf dictionary + """ + pdf_doc = open_pdf(input_pdf_path) + + if pdf_dic is None: + pdf_dic = {} + + if output_pdf_path is None: + output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf") + + for page_id, page in enumerate(pdf_doc): # type: ignore + page_key = f"page_{page_id}" + for ele_key, ele_data in pdf_dic[page_key].items(): + if ele_key == "para_blocks": + para_blocks = ele_data + for para_block in para_blocks: + if "paras" in para_block.keys(): + paras = para_block["paras"] + for para_key, para_content in paras.items(): + para_bbox = para_content["para_bbox"] + # print(f"para_bbox: {para_bbox}") + # print(f"is a nested list: {self.__is_nested_list(para_bbox)}") + if self.__is_nested_list(para_bbox) and len(para_bbox) > 1: + color = (0, 1, 1) + self.__draw_nested_boxes( + page, para_bbox, color + ) # draw with cyan color for combined paragraph + else: + if self.__valid_rect(para_bbox): + para_rect = fitz.Rect(para_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph + para_anno.set_border(width=0.5) + para_anno.update() + + is_para_title = para_content["is_para_title"] + if is_para_title: + if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1: + color = (0, 0, 1) + self.__draw_nested_boxes( + page, para_content["para_bbox"], color + ) # draw with cyan color for combined title + else: + if self.__valid_rect(para_content["para_bbox"]): + para_rect = fitz.Rect(para_content["para_bbox"]) + if self.__valid_rect(para_content["para_bbox"]): + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title + para_anno.set_border(width=0.5) + para_anno.update() + + pdf_doc.save(output_pdf_path) + 
class ParaProcessPipeline:
    """Orchestrates the full paragraph post-processing pipeline for one PDF."""

    def __init__(self) -> None:
        pass

    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
        """
        Process the paragraphs of a parsed PDF, including:
        1. Read raw input json file into pdf_dic
        2. Detect and replace equations
        3. Combine spans into a natural line
        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        5. Compute statistics for each block
        6. Detect titles in the document
        7. Detect paragraphs inside each block
        8. Divide the level of the titles
        9. Detect and combine paragraphs from different blocks into one paragraph
        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
        11. Draw annotations on the pdf file

        Parameters
        ----------
        pdf_info_dict : dict
            Parsed PDF dictionary. Data noise (overlap blocks, header, footer,
            watermark, vertical margin notes) has already been removed.
        para_debug_mode : str or None
            None disables debug output; "full" dumps intermediate stages.
        input_pdf_path : str or None
            Path to the input pdf file.
        output_pdf_path : str or None
            Path to the output (annotated) pdf file.

        Returns
        -------
        tuple
            (pdf_dict, error_info) — error_info is None on success, otherwise
            the exception object describing why the pdf should be discarded.
        """

        error_info = None

        output_json_file = ""
        output_dir = ""

        if input_pdf_path is not None:
            input_pdf_path = os.path.abspath(input_pdf_path)

        if output_pdf_path is not None:
            output_dir = os.path.dirname(output_pdf_path)
            output_json_file = f"{output_dir}/pdf_dic.json"

        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
            """
            Save the pdf_dic of one pipeline stage to a json file.

            NOTE(review): para_debug_mode is captured as a default argument from
            the enclosing call — intentional closure-by-default idiom.
            """
            output_pdf_file_name = os.path.basename(output_pdf_path)
            # NOTE(review): output_dir is hard-coded (backslash-style path);
            # looks like a leftover debug location — confirm whether this
            # should be os.path.dirname(output_pdf_path) instead.
            output_dir = "\\tmp\\pdf_parse"
            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            if para_debug_mode == "full":
                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)

            # Validate the output already exists
            if not os.path.exists(pdf_dic_json_fpath):
                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
                return None
            else:
                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")

            return pdf_dic_json_fpath

        """
        Preprocess the lines of block
        """
        # Combine spans into a natural line
        rawBlockProcessor = RawBlockProcessor()
        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)

        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        layoutFilter = LayoutFilterProcessor()
        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)

        # Compute statistics for each block
        blockStatisticsCalculator = BlockStatisticsCalculator()
        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)

        # Compute statistics for all blocks (namely this pdf document)
        docStatisticsCalculator = DocStatisticsCalculator()
        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)

        # Dump the first three stages of pdf_dic to a json file
        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)

        """
        Detect titles in the document
        """
        doc_statistics = pdf_dic["statistics"]
        titleProcessor = TitleProcessor(doc_statistics)
        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)

        """
        Detect and divide the level of the titles
        """
        # A fresh TitleProcessor (no statistics) is used for level recognition.
        titleProcessor = TitleProcessor()

        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)

        """
        Detect and split paragraphs inside each block
        """
        blockInnerParasProcessor = BlockTerminationProcessor()

        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)

        """
        Detect and combine paragraphs from different blocks into one paragraph
        """
        blockContinuationProcessor = BlockContinuationProcessor()

        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)

        """
        Discard pdf files by checking exceptions and return the error info to the caller
        """
        discardByException = DiscardByException()

        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
            pdf_dic, exception=DenseSingleLineBlockException()
        )
        is_discard_by_title_detection = discardByException.discard_by_title_detection(
            pdf_dic, exception=TitleDetectionException()
        )
        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())

        # First failed check wins; later checks are not reported.
        if is_discard_by_single_line_block is not None:
            error_info = is_discard_by_single_line_block
        elif is_discard_by_title_detection is not None:
            error_info = is_discard_by_title_detection
        elif is_discard_by_title_level is not None:
            error_info = is_discard_by_title_level
        elif is_discard_by_split_para is not None:
            error_info = is_discard_by_split_para
        elif is_discard_by_merge_para is not None:
            error_info = is_discard_by_merge_para

        if error_info is not None:
            return pdf_dic, error_info

        """
        Dump the final pdf_dic to a json file
        """
        if para_debug_mode is not None:
            # NOTE(review): this dumps pdf_info_dict (the pipeline INPUT), not
            # pdf_dic (the processed result) — confirm whether that is intended.
            with open(output_json_file, "w", encoding="utf-8") as f:
                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)

        """
        Draw the annotations
        """
        if para_debug_mode is not None:
            drawAnnos = DrawAnnos()
            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)

        """
        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
        """
        if para_debug_mode is not None:
            # NOTE(review): output_dir may be "" when output_pdf_path is None,
            # which would make os.listdir raise — verify callers always pass it.
            for fpath in os.listdir(output_dir):
                if fpath.endswith(".json") and "stage" in fpath:
                    os.remove(os.path.join(output_dir, fpath))

        return pdf_dic, error_info
"""
Run this script to test the function with Command:

python detect_para.py [pdf_path] [output_pdf_path]

Params:
- pdf_path: the path of the pdf file
- output_pdf_path: the path of the output pdf file
"""

if __name__ == "__main__":
    DEFAULT_PDF_PATH = (
        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
    )
    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    # Bug fix: the original derived output names with input_pdf_path.split(".")[0],
    # which truncates at the FIRST dot anywhere in the path (e.g. "./dir/a.pdf"
    # becomes "" + suffix). os.path.splitext strips only the final extension.
    input_pdf_stem = os.path.splitext(input_pdf_path)[0]
    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_stem + "_recogPara.pdf"
    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_stem + "_recogPara.json"

    import stat

    # Remove existing output file if it exists (clear read-only bit first).
    if os.path.exists(output_pdf_path):
        os.chmod(output_pdf_path, stat.S_IWRITE)
        os.remove(output_pdf_path)

    input_pdf_doc = open_pdf(input_pdf_path)

    # postprocess the paragraphs
    paraProcessPipeline = ParaProcessPipeline()

    # parse paragraph and save to json file
    pdf_dic = {}

    blockInnerParasProcessor = BlockTerminationProcessor()

    """
    Construct the pdf dictionary.
    """

    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
        raw_blocks = page.get_text("dict")["blocks"]

        # Keep only text blocks (type 0) as "preproc_blocks".
        preproc_blocks = [block for block in raw_blocks if block["type"] == 0]

        # Construct the pdf dictionary as the schema above.
        page_dict = {
            "para_blocks": None,
            "preproc_blocks": preproc_blocks,
            "images": None,
            "tables": None,
            "interline_equations": None,
            "inline_equations": None,
            "layout_bboxes": None,
            "pymu_raw_blocks": None,
            "global_statistic": None,
            "droped_text_block": None,
            "droped_image_block": None,
            "droped_table_block": None,
            "image_backup": None,
            "table_backup": None,
        }

        pdf_dic[f"page_{page_id}"] = page_dict

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)

    # NOTE(review): para_process_pipeline's signature is
    # (pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None);
    # this positional call passes a json PATH as pdf_info_dict and the fitz
    # document as para_debug_mode — looks inconsistent, confirm intent.
    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)
def __is_pseudo_single_column(page_info) -> tuple:
    """
    Decide whether a page laid out as a single column is actually a
    pseudo-single-column (lines of neighbouring columns interleaved).

    Args:
        page_info (dict): page info dict containing '_layout_tree' and
            'preproc_blocks'.

    Returns:
        Tuple[bool, Optional[str]]: (True, extra_info) when the page is
        pseudo-single-column, otherwise (False, None). (The original
        docstring claimed a bare bool — the function returns a tuple.)
    """
    layout_tree = page_info['_layout_tree']
    layout_column_width = get_columns_cnt_of_layout(layout_tree)
    if layout_column_width == 1:
        text_blocks = page_info['preproc_blocks']
        for text_block in text_blocks:
            lines = text_block['lines']
            num_lines = len(lines)
            num_satisfying_lines = 0

            for i in range(num_lines - 1):
                current_bbox = lines[i]['bbox']
                next_bbox = lines[i + 1]['bbox']

                # Consecutive lines that do not overlap horizontally suggest
                # the block actually spans side-by-side columns.
                if next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]:
                    num_satisfying_lines += 1

            # Drop the page when a large block has half or more such lines.
            if num_lines > 20:
                ratio = num_satisfying_lines / num_lines  # fixed typo: was "radio"
                if ratio >= 0.5:
                    extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
                    block_text = []
                    for line in lines:
                        if line['spans']:
                            for span in line['spans']:
                                block_text.append(span['text'])
                    logger.warning(f"pseudo_single_column block_text: {block_text}")
                    return True, extra_info

    return False, None


def pdf_post_filter(page_info) -> tuple:
    """
    Post-filter one page.

    Returns:
        (True, None) when the page passes;
        (False, drop_info_dict) when the page should be dropped.
    """
    bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
    if bool_is_pseudo_single_column:
        return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}

    return True, None


def is_below(bbox1, bbox2):
    """Return True if bbox1 lies entirely below bbox2 (its top edge is past bbox2's bottom edge)."""
    return bbox1[1] > bbox2[3]


def merge_bboxes(bboxes):
    """Return the smallest bbox [x0, y0, x1, y1] enclosing every bbox in *bboxes*."""
    x0 = min(bbox[0] for bbox in bboxes)
    y0 = min(bbox[1] for bbox in bboxes)
    x1 = max(bbox[2] for bbox in bboxes)
    y1 = max(bbox[3] for bbox in bboxes)
    return [x0, y0, x1, y1]


def merge_footnote_blocks(page_info, main_text_font):
    """
    For each layout region, validate the candidate footnote bboxes
    ('footnote_bboxes_tmp') and store merged footnote areas in
    page_info['merged_bboxes'].

    A candidate footnote is rejected when a main-text-looking block sits below
    it (a real footnote is the lowest text in its layout region).

    Parameters
    ----------
    page_info : dict
        Page info with 'layout_bboxes', 'footnote_bboxes_tmp', 'preproc_blocks'.
    main_text_font : str
        Name of the document's dominant body font.

    Returns
    -------
    dict
        The same page_info with 'merged_bboxes' populated.
    """
    page_info['merged_bboxes'] = []
    for layout in page_info['layout_bboxes']:
        # Candidate footnote bboxes inside this layout region.
        footnote_bboxes = [block for block in page_info['footnote_bboxes_tmp'] if _is_in(block, layout['layout_bbox'])]
        if not footnote_bboxes:
            continue

        preproc_blocks = [block for block in page_info['preproc_blocks'] if _is_in(block['bbox'], layout['layout_bbox'])]
        font_names = collections.Counter()
        if len(preproc_blocks) > 0:
            # line_sizes: average span size of every line in the region;
            # block_sizes: (block, avg line size, dominant font) per block.
            line_sizes = []
            block_sizes = []
            for block in preproc_blocks:
                block_line_sizes = []
                block_fonts = collections.Counter()
                for line in block['lines']:
                    span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
                    if span_sizes:
                        line_size = sum(span_sizes) / len(span_sizes)
                        line_sizes.append(line_size)
                        block_line_sizes.append(line_size)
                    span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
                                 'font' in span and len(span['text']) > 0]
                    if span_font:
                        # Fonts are weighted by character count, not span count.
                        for font, count in span_font:
                            font_names[font] += count
                            block_fonts[font] += count
                if block_line_sizes:
                    block_size = sum(block_line_sizes) / len(block_line_sizes)
                    block_font = block_fonts.most_common(1)[0][0]
                    block_sizes.append((block, block_size, block_font))

            # Main text size = the most common line size in this region.
            main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]
        else:
            continue

        need_merge_bboxes = []
        # Any footnote candidate with a main-text block below it is a false positive.
        for footnote_bbox in footnote_bboxes:
            # "Main text" below = at least 2 of: size >= main_text_size,
            # >= 5 lines, dominant font equals the document's main font.
            main_text_bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if
                                      is_below(block['bbox'], footnote_bbox) and
                                      sum([size >= main_text_size,
                                           len(block['lines']) >= 5,
                                           block_font == main_text_font])
                                      >= 2]
            if len(main_text_bboxes_below) > 0:
                continue
            need_merge_bboxes.append(footnote_bbox)
        if len(need_merge_bboxes) == 0:
            continue

        # Merge the topmost surviving footnote with every block below it.
        top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
        bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if is_below(block['bbox'], top_footnote_bbox)]
        merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
        page_info['merged_bboxes'].append(merged_bbox)
    return page_info
page_info['merged_bboxes'] + del page_info['footnote_bboxes_tmp'] + return page_info + + +def remove_footnote_text(raw_text_block, footnote_bboxes): + """ + :param raw_text_block: str类型,是当前页的文本内容 + :param footnoteBboxes: list类型,是当前页的脚注bbox + """ + footnote_text_blocks = [] + for block in raw_text_block: + text_bbox = block['bbox'] + # TODO 更严谨点在line级别做 + if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]): + # if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]): + block['tag'] = 'footnote' + footnote_text_blocks.append(block) + # raw_text_block.remove(block) + + # 移除,不能再内部移除,否则会出错 + for block in footnote_text_blocks: + raw_text_block.remove(block) + + return raw_text_block, footnote_text_blocks + + +def remove_footnote_image(image_blocks, footnote_bboxes): + """ + :param image_bboxes: list类型,是当前页的图片bbox(结构体) + :param footnoteBboxes: list类型,是当前页的脚注bbox + """ + footnote_imgs_blocks = [] + for image_block in image_blocks: + if any([_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]): + footnote_imgs_blocks.append(image_block) + + for footnote_imgs_block in footnote_imgs_blocks: + image_blocks.remove(footnote_imgs_block) + + return image_blocks, footnote_imgs_blocks \ No newline at end of file diff --git a/magic_pdf/pre_proc/__init__.py b/magic_pdf/pre_proc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/pre_proc/citationmarker_remove.py b/magic_pdf/pre_proc/citationmarker_remove.py new file mode 100644 index 0000000000000000000000000000000000000000..2a2c485f78764e32354f95f75965d2546d98fe3f --- /dev/null +++ b/magic_pdf/pre_proc/citationmarker_remove.py @@ -0,0 +1,157 @@ +""" +去掉正文的引文引用marker +https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d +""" +import re +# from magic_pdf.libs.nlp_utils import NLPModels + + +# __NLP_MODEL = NLPModels() + +def check_1(spans, 
cur_span_i): + """寻找前一个char,如果是句号,逗号,那么就是角标""" + if cur_span_i==0: + return False # 不是角标 + pre_span = spans[cur_span_i-1] + pre_char = pre_span['chars'][-1]['c'] + if pre_char in ['。', ',', '.', ',']: + return True + + return False + + +# def check_2(spans, cur_span_i): +# """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标""" +# pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写 +# +# if cur_span_i==0 and len(spans)>1: +# next_span = spans[cur_span_i+1] +# next_txt = "".join([c['c'] for c in next_span['chars']]) +# result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt) +# if result in ["PERSON", "GPE", "ORG"]: +# return True +# +# if re.findall(pattern, next_txt): +# return True +# +# return False # 不是角标 +# elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除 +# return False +# +# # 如果这个span是最后一个span, +# if cur_span_i==len(spans)-1: +# pre_span = spans[cur_span_i-1] +# pre_txt = "".join([c['c'] for c in pre_span['chars']]) +# pre_word = pre_txt.split(' ')[-1] +# result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt) +# if result in ["PERSON", "GPE", "ORG"]: +# return True +# +# if re.findall(pattern, pre_txt): +# return True +# +# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower() +# else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标 +# pre_span = spans[cur_span_i-1] +# next_span = spans[cur_span_i+1] +# cur_span = spans[cur_span_i] +# # 找到前一个和后一个span里的距离最近的单词 +# pre_distance = 10000 # 一个很大的数 +# next_distance = 10000 # 一个很大的数 +# for c in pre_span['chars'][::-1]: +# if c['c'].isalpha(): +# pre_distance = cur_span['bbox'][0] - c['bbox'][2] +# break +# for c in next_span['chars']: +# if c['c'].isalpha(): +# next_distance = c['bbox'][0] - cur_span['bbox'][2] +# break +# +# if pre_distance 5 and pre_word.isalpha() and pre_word.islower() + + +def check_3(spans, cur_span_i): + """上标里有[], 有*, 有-, 有逗号""" + # 如[2-3],[22] + # 如 2,3,4 + cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip() + bad_char = 
['[', ']', '*', ','] + + if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt): + return True + + # 如2-3, a-b + patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]'] + for pattern in patterns: + match = re.match(pattern, cur_span_txt) + if match is not None: + return True + + return False + + +def remove_citation_marker(with_char_text_blcoks): + for blk in with_char_text_blcoks: + for line in blk['lines']: + # 如果span里的个数少于2个,那只能忽略,角标不可能自己独占一行 + if len(line['spans'])<=1: + continue + + # 找到高度最高的span作为位置比较的基准 + max_hi_span = line['spans'][0]['bbox'] + min_font_sz = 10000 # line里最小的字体 + max_font_sz = 0 # line里最大的字体 + + for s in line['spans']: + if max_hi_span[3]-max_hi_span[1]s['size']: + min_font_sz = s['size'] + if max_font_sz0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1): + """ + 1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式 + 2. 如果这个角标的前面是一个单词(长度大于5)而不是任何大写或小写的短字母的话 应该也是角标 + 3. 上标里有数字和逗号或者数字+星号的组合,方括号,一般肯定就是角标了 + 4. 
这个角标属于前文还是后文要根据距离来判断,如果距离前面的文本太近,那么就是前面的角标,否则就是后面的角标 + """ + if (check_1(line['spans'], i) or + # check_2(line['spans'], i) or + check_3(line['spans'], i) + ): + """删除掉这个角标:删除这个span, 同时还要更新line的text""" + span_to_del.append(span) + if len(span_to_del)>0: + for span in span_to_del: + line['spans'].remove(span) + line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']]) + + return with_char_text_blcoks diff --git a/magic_pdf/pre_proc/construct_page_dict.py b/magic_pdf/pre_proc/construct_page_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f83c1005ba507dee65f65aaa52ee34b805b477 --- /dev/null +++ b/magic_pdf/pre_proc/construct_page_dict.py @@ -0,0 +1,72 @@ +def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info, + interline_eq_info, raw_pymu_blocks, + removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup, + layout_tree, + page_w, page_h, footnote_bboxes_tmp): + """ + + """ + return_dict = {} + + return_dict['para_blocks'] = {} + return_dict['preproc_blocks'] = text_blocks_preproc + return_dict['images'] = image_info + return_dict['tables'] = table_info + return_dict['interline_equations'] = interline_eq_info + return_dict['inline_equations'] = inline_eq_info + return_dict['layout_bboxes'] = layout_bboxes + return_dict['pymu_raw_blocks'] = raw_pymu_blocks + return_dict['global_statistic'] = {} + + return_dict['droped_text_block'] = removed_text_blocks + return_dict['droped_image_block'] = removed_image_blocks + return_dict['droped_table_block'] = [] + return_dict['image_backup'] = images_backup + return_dict['table_backup'] = [] + return_dict['page_idx'] = page_id + return_dict['page_size'] = [page_w, page_h] + return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用 + return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp + + return return_dict + + +def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, 
page_h, layout_tree, + images, tables, interline_equations, inline_equations, + dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block, + need_remove_spans_bboxes_dict): + return_dict = { + 'preproc_blocks': blocks, + 'layout_bboxes': layout_bboxes, + 'page_idx': page_id, + 'page_size': [page_w, page_h], + '_layout_tree': layout_tree, + 'images': images, + 'tables': tables, + 'interline_equations': interline_equations, + 'inline_equations': inline_equations, + 'droped_text_block': dropped_text_block, + 'droped_image_block': dropped_image_block, + 'droped_table_block': dropped_table_block, + 'dropped_equation_block': dropped_equation_block, + 'droped_bboxes': need_remove_spans_bboxes_dict, + } + return return_dict + + +def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, + images, tables, interline_equations, discarded_blocks, need_drop, drop_reason): + return_dict = { + 'preproc_blocks': blocks, + 'layout_bboxes': layout_bboxes, + 'page_idx': page_id, + 'page_size': [page_w, page_h], + '_layout_tree': layout_tree, + 'images': images, + 'tables': tables, + 'interline_equations': interline_equations, + 'discarded_blocks': discarded_blocks, + 'need_drop': need_drop, + 'drop_reason': drop_reason, + } + return return_dict diff --git a/magic_pdf/pre_proc/cut_image.py b/magic_pdf/pre_proc/cut_image.py new file mode 100644 index 0000000000000000000000000000000000000000..18ee65129d0d07b14bbd5aadec16f91b646cf06d --- /dev/null +++ b/magic_pdf/pre_proc/cut_image.py @@ -0,0 +1,71 @@ +from loguru import logger + +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.libs.pdf_image_tools import cut_image + + +def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): + def return_path(type): + return join_path(pdf_bytes_md5, type) + + for span in spans: + span_type = span['type'] + if span_type == ContentType.Image: + if not 
check_img_bbox(span['bbox']): + continue + span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), + imageWriter=imageWriter) + elif span_type == ContentType.Table: + if not check_img_bbox(span['bbox']): + continue + span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), + imageWriter=imageWriter) + + return spans + + +def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str, + image_bboxes: list, images_overlap_backup: list, table_bboxes: list, + equation_inline_bboxes: list, + equation_interline_bboxes: list, imageWriter) -> dict: + """ + 返回一个dict, key为bbox, 值是图片地址 + """ + image_info = [] + image_backup_info = [] + table_info = [] + inline_eq_info = [] + interline_eq_info = [] + + # 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg + + def return_path(type): + return join_path(pdf_bytes_md5, type) + + for bbox in image_bboxes: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) + image_info.append({"bbox": bbox, "image_path": image_path}) + + for bbox in images_overlap_backup: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) + image_backup_info.append({"bbox": bbox, "image_path": image_path}) + + for bbox in table_bboxes: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter) + table_info.append({"bbox": bbox, "image_path": image_path}) + + return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info + + +def check_img_bbox(bbox) -> bool: + if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]): + logger.warning(f"image_bboxes: 错误的box, {bbox}") + return False + return True diff --git a/magic_pdf/pre_proc/detect_equation.py b/magic_pdf/pre_proc/detect_equation.py new 
file mode 100644 index 0000000000000000000000000000000000000000..f395030c4233db92023512b201f896e5d814f03c --- /dev/null +++ b/magic_pdf/pre_proc/detect_equation.py @@ -0,0 +1,134 @@ +from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则 +from magic_pdf.libs.commons import fitz # pyMuPDF库 + + +def __solve_contain_bboxs(all_bbox_list: list): + + """将两个公式的bbox做判断是否有包含关系,若有的话则删掉较小的bbox""" + + dump_list = [] + for i in range(len(all_bbox_list)): + for j in range(i + 1, len(all_bbox_list)): + # 获取当前两个值 + bbox1 = all_bbox_list[i][:4] + bbox2 = all_bbox_list[j][:4] + + # 删掉较小的框 + if _is_in(bbox1, bbox2): + dump_list.append(all_bbox_list[i]) + elif _is_in(bbox2, bbox1): + dump_list.append(all_bbox_list[j]) + else: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) + if ratio > 0.7: + s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) + s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + if s2 > s1: + dump_list.append(all_bbox_list[i]) + else: + dump_list.append(all_bbox_list[i]) + + # 遍历需要删除的列表中的每个元素 + for item in dump_list: + + while item in all_bbox_list: + all_bbox_list.remove(item) + return all_bbox_list + + +def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + DPI = 72 # use this resolution + pix = page.get_pixmap(dpi=DPI) + pageL = 0 + pageR = int(pix.w) + pageU = 0 + pageD = int(pix.h) + + + #--------- 通过json_from_DocXchain来获取 table ---------# + equationEmbedding_from_DocXChain_bboxs = [] + equationIsolated_from_DocXChain_bboxs = [] + + xf_json = json_from_DocXchain_obj + width_from_json = xf_json['page_info']['width'] + height_from_json = 
xf_json['page_info']['height'] + LR_scaleRatio = width_from_json / (pageR - pageL) + UD_scaleRatio = height_from_json / (pageD - pageU) + + for xf in xf_json['layout_dets']: + # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'} + L = xf['poly'][0] / LR_scaleRatio + U = xf['poly'][1] / UD_scaleRatio + R = xf['poly'][2] / LR_scaleRatio + D = xf['poly'][5] / UD_scaleRatio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + # equation + img_suffix = f"{page_ID}_{int(L)}_{int(U)}_{int(R)}_{int(D)}" + if xf['category_id'] == 13 and xf['score'] >= 0.3: + latex_text = xf.get("latex", "EmptyInlineEquationResult") + debugable_latex_text = f"{latex_text}|{img_suffix}" + equationEmbedding_from_DocXChain_bboxs.append((L, U, R, D, latex_text)) + if xf['category_id'] == 14 and xf['score'] >= 0.3: + latex_text = xf.get("latex", "EmptyInterlineEquationResult") + debugable_latex_text = f"{latex_text}|{img_suffix}" + equationIsolated_from_DocXChain_bboxs.append((L, U, R, D, latex_text)) + + #---------------------------------------- 排序,编号,保存 -----------------------------------------# + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + + equationEmbedding_from_DocXChain_names = [] + equationEmbedding_ID = 0 + + equationIsolated_from_DocXChain_names = [] + equationIsolated_ID = 0 + + for L, U, R, D, _ in equationEmbedding_from_DocXChain_bboxs: + if not(L < R and U < D): + continue + try: + # cur_equation = page.get_pixmap(clip=(L,U,R,D)) + new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationEmbedding_ID) # 公式name + # cur_equation.save(res_dir_path + '/' + new_equation_name) # 把公式存出在新建的文件夹,并命名 + 
equationEmbedding_from_DocXChain_names.append(new_equation_name) # 把公式的名字存在list中,方便在md中插入引用 + equationEmbedding_ID += 1 + except: + pass + + for L, U, R, D, _ in equationIsolated_from_DocXChain_bboxs: + if not(L < R and U < D): + continue + try: + # cur_equation = page.get_pixmap(clip=(L,U,R,D)) + new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationIsolated_ID) # 公式name + # cur_equation.save(res_dir_path + '/' + new_equation_name) # 把公式存出在新建的文件夹,并命名 + equationIsolated_from_DocXChain_names.append(new_equation_name) # 把公式的名字存在list中,方便在md中插入引用 + equationIsolated_ID += 1 + except: + pass + + equationEmbedding_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + + + """根据pdf可视区域,调整bbox的坐标""" + cropbox = page.cropbox + if cropbox[0]!=page.rect[0] or cropbox[1]!=page.rect[1]: + for eq_box in equationEmbedding_from_DocXChain_bboxs: + eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]] + for eq_box in equationIsolated_from_DocXChain_bboxs: + eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]] + + deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs) + return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs diff --git a/magic_pdf/pre_proc/detect_footer_by_model.py b/magic_pdf/pre_proc/detect_footer_by_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1fbf38b6c5a61b477b0aab594f966ef3d2676e --- /dev/null +++ b/magic_pdf/pre_proc/detect_footer_by_model.py @@ -0,0 +1,64 @@ +from magic_pdf.libs.commons import fitz # pyMuPDF库 +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param 
res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + + #--------- 通过json_from_DocXchain来获取 footer ---------# + footer_bbox_from_DocXChain = [] + + xf_json = json_from_DocXchain_obj + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page) + + # {0: 'title', # 标题 + # 1: 'figure', # 图片 + # 2: 'plain text', # 文本 + # 3: 'header', # 页眉 + # 4: 'page number', # 页码 + # 5: 'footnote', # 脚注 + # 6: 'footer', # 页脚 + # 7: 'table', # 表格 + # 8: 'table caption', # 表格描述 + # 9: 'figure caption', # 图片描述 + # 10: 'equation', # 公式 + # 11: 'full column', # 单栏 + # 12: 'sub column', # 多栏 + # 13: 'embedding', # 嵌入公式 + # 14: 'isolated'} # 单行公式 + for xf in xf_json['layout_dets']: + L = xf['poly'][0] / horizontal_scale_ratio + U = xf['poly'][1] / vertical_scale_ratio + R = xf['poly'][2] / horizontal_scale_ratio + D = xf['poly'][5] / vertical_scale_ratio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + if xf['category_id'] == 6 and xf['score'] >= 0.3: + footer_bbox_from_DocXChain.append((L, U, R, D)) + + + footer_final_names = [] + footer_final_bboxs = [] + footer_ID = 0 + for L, U, R, D in footer_bbox_from_DocXChain: + # cur_footer = page.get_pixmap(clip=(L,U,R,D)) + new_footer_name = "footer_{}_{}.png".format(page_ID, footer_ID) # 脚注name + # cur_footer.save(res_dir_path + '/' + new_footer_name) # 把页脚存储在新建的文件夹,并命名 + footer_final_names.append(new_footer_name) # 把脚注的名字存在list中 + footer_final_bboxs.append((L, U, R, D)) + footer_ID += 1 + + + footer_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + curPage_all_footer_bboxs = footer_final_bboxs + return curPage_all_footer_bboxs + diff --git a/magic_pdf/pre_proc/detect_footer_header_by_statistics.py 
b/magic_pdf/pre_proc/detect_footer_header_by_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..340965d0ea92c3aa75e4a428da4cdff728db1124 --- /dev/null +++ b/magic_pdf/pre_proc/detect_footer_header_by_statistics.py @@ -0,0 +1,284 @@ +from collections import defaultdict + +from magic_pdf.libs.boxbase import calculate_iou + + +def compare_bbox_with_list(bbox, bbox_list, tolerance=1): + return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list) + +def is_single_line_block(block): + # Determine based on the width and height of the block + block_width = block["X1"] - block["X0"] + block_height = block["bbox"][3] - block["bbox"][1] + + # If the height of the block is close to the average character height and the width is large, it is considered a single line + return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3 + +def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2): + """ + This function gets the most common bboxes from the bboxes + + Parameters + ---------- + bboxes : list + bboxes + page_height : float + height of the page + position : str, optional + "top" or "bottom", by default "top" + threshold : float, optional + threshold, by default 0.25 + num_bboxes : int, optional + number of bboxes to return, by default 3 + min_frequency : int, optional + minimum frequency of the bbox, by default 2 + + Returns + ------- + common_bboxes : list + common bboxes + """ + # Filter bbox by position + if position == "top": + filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold] + else: + filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)] + + # Find the most common bbox + bbox_count = defaultdict(int) + for bbox in filtered_bboxes: + bbox_count[tuple(bbox)] += 1 + + # Get the most frequently occurring bbox, but only consider it when the 
frequency exceeds min_frequency + common_bboxes = [ + bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency + ][:num_bboxes] + return common_bboxes + +def detect_footer_header2(result_dict, similarity_threshold=0.5): + """ + This function detects the header and footer of the document. + + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + result_dict : dict + result dictionary + """ + # Traverse all blocks in the document + single_line_blocks = 0 + total_blocks = 0 + single_line_blocks = 0 + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + total_blocks += 1 + if is_single_line_block(block): + single_line_blocks += 1 + + # If there are no blocks, skip the header and footer detection + if total_blocks == 0: + print("No blocks found. Skipping header/footer detection.") + return result_dict + + # If most of the blocks are single-line, skip the header and footer detection + if single_line_blocks / total_blocks > 0.5: # 50% of the blocks are single-line + # print("Skipping header/footer detection for text-dense document.") + return result_dict + + # Collect the bounding boxes of all blocks + all_bboxes = [] + all_texts = [] + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + all_bboxes.append(block["bbox"]) + + # Get the height of the page + page_height = max(bbox[3] for bbox in all_bboxes) + + # Get the most common bbox lists for headers and footers + common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else [] + common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else [] + + # Detect and mark headers and footers + for page_id, blocks in 
result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + bbox = block["bbox"] + text = block["text"] + + is_header = compare_bbox_with_list(bbox, common_header_bboxes) + is_footer = compare_bbox_with_list(bbox, common_footer_bboxes) + block["is_header"] = int(is_header) + block["is_footer"] = int(is_footer) + + return result_dict + + +def __get_page_size(page_sizes:list): + """ + 页面大小可能不一样 + """ + w = sum([w for w,h in page_sizes])/len(page_sizes) + h = sum([h for w,h in page_sizes])/len(page_sizes) + return w, h + +def __calculate_iou(bbox1, bbox2): + iou = calculate_iou(bbox1, bbox2) + return iou + +def __is_same_pos(box1, box2, iou_threshold): + iou = __calculate_iou(box1, box2) + return iou >= iou_threshold + + +def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9): + """ + common bbox必须大于page_cnt的1/3 + """ + min_occurance_cnt = max(3, page_cnt//4) + header_det_bbox = [] + footer_det_bbox = [] + + hdr_same_pos_group = [] + btn_same_pos_group = [] + + page_w, page_h = __get_page_size(page_size) + top_y, bottom_y = page_w*page_range_threshold, page_h*(1-page_range_threshold) + + top_bbox = [b for b in bboxes if b[3]bottom_y] + # 然后开始排序,寻找最经常出现的bbox, 寻找的时候如果IOU>iou_threshold就算是一个 + for i in range(0, len(top_bbox)): + hdr_same_pos_group.append([top_bbox[i]]) + for j in range(i+1, len(top_bbox)): + if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold): + #header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])] + hdr_same_pos_group[i].append(top_bbox[j]) + + for i in range(0, len(bottom_bbox)): + btn_same_pos_group.append([bottom_bbox[i]]) + for j in range(i+1, len(bottom_bbox)): + if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold): + #footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), 
min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])] + btn_same_pos_group[i].append(bottom_bbox[j]) + + # 然后看下每一组的bbox,是否符合大于page_cnt一定比例 + hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt] + btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt] + + # 平铺2个list[list] + hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g] + btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g] + # 寻找hdr_same_pos_group中的box[3]最大值,btn_same_pos_group中的box[1]最小值 + hdr_same_pos_group.sort(key=lambda b:b[3]) + btn_same_pos_group.sort(key=lambda b:b[1]) + + hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0 + btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h + + header_det_bbox = [0, 0, page_w, hdr_y] + footer_det_bbox = [0, btn_y, page_w, page_h] + # logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}") + return header_det_bbox, footer_det_bbox, page_w, page_h + + +def drop_footer_header(pdf_info_dict:dict): + """ + 启用规则探测,在全局的视角上通过统计的方法。 + """ + header = [] + footer = [] + + all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']] + image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']] + page_size = [val['page_size'] for _, val in pdf_info_dict.items()] + page_cnt = len(pdf_info_dict.keys()) # 一共多少页 + header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt) + + """" + 把范围扩展到页面水平的整个方向上 + """ + if header: + header = [0, 0, page_w, header[3]+1] + + if footer: + footer = [0, footer[1]-1, page_w, page_h] + + # 找到footer, header范围之后,针对每一页pdf,从text、图片中删除这些范围内的内容 + # 移除text block + + for _, page_info in pdf_info_dict.items(): + header_text_blk = [] + footer_text_blk = [] + for 
blk in page_info['preproc_blocks']: + blk_bbox = blk['bbox'] + if header and blk_bbox[3]<=header[3]: + blk['tag'] = "header" + header_text_blk.append(blk) + elif footer and blk_bbox[1]>=footer[1]: + blk['tag'] = "footer" + footer_text_blk.append(blk) + + # 放入text_block_droped中 + page_info['droped_text_block'].extend(header_text_blk) + page_info['droped_text_block'].extend(footer_text_blk) + + for blk in header_text_blk: + page_info['preproc_blocks'].remove(blk) + for blk in footer_text_blk: + page_info['preproc_blocks'].remove(blk) + + """接下来把footer、header上的图片也删除掉。图片包括正常的和backup的""" + header_image = [] + footer_image = [] + + for image_info in page_info['images']: + img_bbox = image_info['bbox'] + if header and img_bbox[3]<=header[3]: + image_info['tag'] = "header" + header_image.append(image_info) + elif footer and img_bbox[1]>=footer[1]: + image_info['tag'] = "footer" + footer_image.append(image_info) + + page_info['droped_image_block'].extend(header_image) + page_info['droped_image_block'].extend(footer_image) + + for img in header_image: + page_info['images'].remove(img) + for img in footer_image: + page_info['images'].remove(img) + + """接下来吧backup的图片也删除掉""" + header_image = [] + footer_image = [] + + for image_info in page_info['image_backup']: + img_bbox = image_info['bbox'] + if header and img_bbox[3]<=header[3]: + image_info['tag'] = "header" + header_image.append(image_info) + elif footer and img_bbox[1]>=footer[1]: + image_info['tag'] = "footer" + footer_image.append(image_info) + + page_info['droped_image_block'].extend(header_image) + page_info['droped_image_block'].extend(footer_image) + + for img in header_image: + page_info['image_backup'].remove(img) + for img in footer_image: + page_info['image_backup'].remove(img) + + return header, footer diff --git a/magic_pdf/pre_proc/detect_footnote.py b/magic_pdf/pre_proc/detect_footnote.py new file mode 100644 index 0000000000000000000000000000000000000000..4f903c85582dd37b6ec8a3efc4165f39eaac58ee --- 
from collections import Counter

from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.coordinate_transform import get_scale_ratio


def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict,
                             md_bookname_save_path=None, debug_mode=False):
    """
    Collect footnote bounding boxes detected by the DocXChain layout model.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :param md_bookname_save_path: directory used only in debug mode for footnote crops
        (the actual crop saving is currently disabled).
    :param debug_mode: when True, per-footnote image names are generated.
    :return: list of (L, U, R, D) footnote bboxes sorted by (top, left).
    """
    footnote_bbox_from_DocXChain = []

    xf_json = json_from_DocXchain_obj
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    for xf in xf_json['layout_dets']:
        L = xf['poly'][0] / horizontal_scale_ratio
        U = xf['poly'][1] / vertical_scale_ratio
        R = xf['poly'][2] / horizontal_scale_ratio
        D = xf['poly'][5] / vertical_scale_ratio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # 0.43 is the tuned footnote confidence threshold (raised from 0.3).
        if xf['category_id'] == 5 and xf['score'] >= 0.43:
            footnote_bbox_from_DocXChain.append((L, U, R, D))

    footnote_final_names = []
    footnote_final_bboxs = []
    footnote_ID = 0
    for L, U, R, D in footnote_bbox_from_DocXChain:
        if debug_mode:
            # cur_footnote = page.get_pixmap(clip=(L, U, R, D))
            new_footnote_name = "footnote_{}_{}.png".format(page_ID, footnote_ID)
            # cur_footnote.save(md_bookname_save_path + '/' + new_footnote_name)
            footnote_final_names.append(new_footnote_name)
        footnote_final_bboxs.append((L, U, R, D))
        footnote_ID += 1

    footnote_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    curPage_all_footnote_bboxs = footnote_final_bboxs
    return curPage_all_footnote_bboxs


def need_remove(block):
    """
    Heuristic filter: return True if a candidate text block should NOT be
    treated as a footnote (e.g. headings or a "Keywords" line).
    """
    if 'lines' in block and len(block['lines']) > 0:
        # A single-line block whose only span is all upper-case, or is set in a
        # bold font variant ('SB', 'bold', 'Bold'), is likely a heading.
        if len(block['lines']) == 1:
            if 'spans' in block['lines'][0] and len(block['lines'][0]['spans']) == 1:
                font_keywords = ['SB', 'bold', 'Bold']
                only_span = block['lines'][0]['spans'][0]
                if only_span['text'].isupper() or any(
                        keyword in only_span['font'] for keyword in font_keywords):
                    return True
        for line in block['lines']:
            if 'spans' in line and len(line['spans']) > 0:
                for span in line['spans']:
                    # Case-insensitive check for "keyword" (e.g. a "Keywords:"
                    # line), which is not a footnote.
                    if "keyword" in span['text'].lower():
                        return True
    return False


def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
    """
    Rule-based footnote detection: find text blocks in the lower part of the
    page whose font size is smaller than the dominant body-text size.

    Args:
        remain_text_blocks (list): text blocks still unassigned on this page.
        page_height (float): page height in points.
        page_id (int): 0-based page index; only pages 0-2 are processed.
        main_text_font (str): dominant body-text font name, computed upstream.

    Returns:
        list: bboxes of blocks judged to be footnotes.
    """
    if page_id > 2:  # for precision, only scan the first 3 pages
        return []

    line_sizes = []    # average font size of every line on the page
    block_sizes = []   # (block, avg line size, dominant font) per block
    font_names = Counter()  # page-wide font usage weighted by char count
    if len(remain_text_blocks) > 0:
        for block in remain_text_blocks:
            block_line_sizes = []
            block_fonts = Counter()
            for line in block['lines']:
                span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
                if span_sizes:
                    line_size = sum(span_sizes) / len(span_sizes)
                    line_sizes.append(line_size)
                    block_line_sizes.append(line_size)
                span_font = [(span['font'], len(span['text']))
                             for span in line['spans']
                             if 'font' in span and len(span['text']) > 0]
                if span_font:
                    # Weight fonts by character count (not per-span), so the
                    # dominant font reflects the amount of text it covers.
                    for font, count in span_font:
                        font_names[font] += count
                        block_fonts[font] += count
            if block_line_sizes:
                block_size = sum(block_line_sizes) / len(block_line_sizes)
                # BUGFIX: block_fonts can be empty (spans with sizes but no
                # usable font/text); most_common(1)[0] would raise IndexError.
                block_font = block_fonts.most_common(1)[0][0] if block_fonts else None
                block_sizes.append((block, block_size, block_font))

        # BUGFIX: line_sizes can be empty (no span carries a 'size'), which
        # previously crashed on most_common(1)[0]; there is nothing to detect.
        if not line_sizes:
            return []
        # The most common line size is taken as the body-text size.
        main_text_size = Counter(line_sizes).most_common(1)[0][0]

        # Drop blocks that are likely misidentified as footnotes.
        block_sizes = [(block, block_size, block_font)
                       for block, block_size, block_font in block_sizes
                       if not need_remove(block)]

        # Strict rule: lower 40% of the page, smaller than body text, and
        # either short (< 5 lines) or set in a non-body font.
        footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes
                           if block['bbox'][1] > page_height * 0.6 and
                           block_size < main_text_size and
                           (len(block['lines']) < 5 or
                            block_font != main_text_font)]

        return footnote_bboxes
    else:
        return []
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.coordinate_transform import get_scale_ratio


def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect page-header bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) header bboxes sorted by (top, left).
    """
    xf_json = json_from_DocXchain_obj
    h_ratio, v_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    header_bbox_from_DocXChain = []
    for det in xf_json['layout_dets']:
        x0 = det['poly'][0] / h_ratio
        y0 = det['poly'][1] / v_ratio
        x1 = det['poly'][2] / h_ratio
        y1 = det['poly'][5] / v_ratio
        left, right = min(x0, x1), max(x0, x1)
        top, bottom = min(y0, y1), max(y0, y1)
        # category 3 == page header, 0.3 confidence threshold
        if det['category_id'] == 3 and det['score'] >= 0.3:
            header_bbox_from_DocXChain.append((left, top, right, bottom))

    # Crop saving is disabled; names are generated only for parity with the
    # other detect_* helpers.
    header_final_names = []
    header_final_bboxs = []
    for header_ID, bbox in enumerate(header_bbox_from_DocXChain):
        # cur_header = page.get_pixmap(clip=bbox)
        header_final_names.append("header_{}_{}.png".format(page_ID, header_ID))
        # cur_header.save(res_dir_path + '/' + new_header_name)
        header_final_bboxs.append(bbox)

    header_final_bboxs.sort(key=lambda bbox: (bbox[1], bbox[0]))
    curPage_all_header_bboxs = header_final_bboxs
    return curPage_all_header_bboxs


import collections  # deque / counting helpers for the svg flood fill below
import re
from magic_pdf.libs.commons import fitz  # PyMuPDF


# --------------------------------- Tool Functions --------------------------------- #
def remove_special_chars(s: str) -> str:
    """Strip every character that is not a-z, A-Z or 0-9."""
    return re.sub(r"[^a-zA-Z0-9]", "", s)


def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if the two rectangles are exactly identical."""
    return (L1, U1, R1, D1) == (L2, U2, R2, D2)


def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if rect1 fully contains rect2 (shared edges allowed)."""
    horizontally_inside = L1 <= L2 <= R2 <= R1
    vertically_inside = U1 <= U2 <= D2 <= D1
    return horizontally_inside and vertically_inside


def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if rect1 and rect2 intersect; a shared edge counts as overlap."""
    return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2)


def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float,
                                                   L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """
    Overlap area of the two rects as a fraction of each rect's own area.
    Returns (0, 0) when the rects are disjoint or either one is degenerate.
    """
    if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
        return 0, 0
    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        return 0, 0
    overlap_area = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
    return overlap_area / area_1, overlap_area / area_2
return 0, 0 + square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2)) + return square_overlap / square_1, square_overlap / square_2 + +def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float): + # 计算两个line,重叠区间各占2个line长度的比例 + if max(L1, L2) > min(R1, R2): + return 0, 0 + if L1 == R1 or L2 == R2: + return 0, 0 + overlap_line = min(R1, R2) - max(L1, L2) + return overlap_line / (R1 - L1), overlap_line / (R2 - L2) + + +# 判断rect其实是一条line +def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool: + width = R - L + height = D - U + if width <= 3 or height <= 3: + return True + if width / height >= 30 or height / width >= 30: + return True + + + +def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + #### 通过fitz获取page信息 + ## 超越边界 + DPI = 72 # use this resolution + pix = page.get_pixmap(dpi=DPI) + pageL = 0 + pageR = int(pix.w) + pageU = 0 + pageD = int(pix.h) + + #----------------- 保存每一个文本块的LURD ------------------# + textLine_blocks = [] + blocks = page.get_text( + "dict", + flags=fitz.TEXTFLAGS_TEXT, + #clip=clip, + )["blocks"] + for i in range(len(blocks)): + bbox = blocks[i]['bbox'] + # print(bbox) + for tt in blocks[i]['lines']: + # 当前line + cur_line_bbox = None # 当前line,最右侧的section的bbox + for xf in tt['spans']: + L, U, R, D = xf['bbox'] + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + textLine_blocks.append((L, U, R, D)) + textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0])) + + + #---------------------------------------------- 保存img --------------------------------------------------# + raw_imgs = 
page.get_images() # 获取所有的图片 + imgs = [] + img_names = [] # 保存图片的名字,方便在md中插入引用 + img_bboxs = [] # 保存图片的location信息。 + img_visited = [] # 记忆化,记录该图片是否在md中已经插入过了 + img_ID = 0 + + ## 获取、保存每张img的location信息(x1, y1, x2, y2, UL, DR坐标) + for i in range(len(raw_imgs)): + # 如果图片在junklist中则跳过 + if raw_imgs[i][0] in junk_img_bojids: + continue + else: + try: + tt = page.get_image_rects(raw_imgs[i][0], transform = True) + + rec = tt[0][0] + L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3]) + + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD): + continue + if pageL == L and R == pageR: + continue + if pageU == U and D == pageD: + continue + # pix1 = page.get_Pixmap(clip=(L,U,R,D)) + new_img_name = "{}_{}.png".format(page_ID, i) # 图片name + # pix1.save(res_dir_path + '/' + new_img_name) # 把图片存出在新建的文件夹,并命名 + img_names.append(new_img_name) + img_bboxs.append((L, U, R, D)) + img_visited.append(False) + imgs.append(raw_imgs[i]) + except: + continue + + #-------- 如果img之间有重叠。说明获取的img大小有问题,位置也不一定对。就扔掉--------# + imgs_ok = [True for _ in range(len(imgs))] + for i in range(len(imgs)): + L1, U1, R1, D1 = img_bboxs[i] + for j in range(i + 1, len(imgs)): + L2, U2, R2, D2 = img_bboxs[j] + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + if ratio_1 > 0 and ratio_2 > 0: + if ratio_1 == 1 and ratio_2 > 0.8: + imgs_ok[i] = False + elif ratio_1 > 0.8 and ratio_2 == 1: + imgs_ok[j] = False + elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4: + imgs_ok[i] = False + imgs_ok[j] = False + elif s1 / s2 > 5 and ratio_2 > 0.5: + imgs_ok[j] = False + elif s2 / s1 > 5 and ratio_1 > 0.5: + imgs_ok[i] = False + + imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True] + img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True] + img_bboxs = [img_bboxs[i] for i in 
range(len(imgs)) if imgs_ok[i] == True] + img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True] + #*******************************************************************************# + + #---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------# + # + svgs = page.get_drawings() + #------------ preprocess, check一些大框,看是否是合理的 ----------# + ## 去重。有时候会遇到rect1和rect2是完全一样的情形。 + svg_rect_visited = set() + available_svgIdx = [] + for i in range(len(svgs)): + L, U, R, D = svgs[i]['rect'].irect + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + tt = (L, U, R, D) + if tt not in svg_rect_visited: + svg_rect_visited.add(tt) + available_svgIdx.append(i) + + svgs = [svgs[i] for i in available_svgIdx] # 去重后,有效的svgs + svg_childs = [[] for _ in range(len(svgs))] + svg_parents = [[] for _ in range(len(svgs))] + svg_overlaps = [[] for _ in range(len(svgs))] #svg_overlaps[i]是一个list,存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9] + svg_visited = [False for _ in range(len(svgs))] + svg_exceedPage = [0 for _ in range(len(svgs))] # 是否超越边界(artbox),很大,但一般是一个svg的底。 + + + for i in range(len(svgs)): + L, U, R, D = svgs[i]['rect'].irect + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD) + if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20): + if ratio_2 >= 0.7: + svg_exceedPage[i] += 4 + else: + if L <= pageL: + svg_exceedPage[i] += 1 + if pageR <= R: + svg_exceedPage[i] += 1 + if U <= pageU: + svg_exceedPage[i] += 1 + if pageD <= D: + svg_exceedPage[i] += 1 + + #### 如果有≥2个的超边界的框,就不要手写规则判断svg了。很难写对。 + if len([x for x in svg_exceedPage if x >= 1]) >= 2: + svgs = [] + svg_childs = [] + svg_parents = [] + svg_overlaps = [] + svg_visited = [] + svg_exceedPage = [] + + #---------------------------- build graph ----------------------------# + for i, p in enumerate(svgs): + L1, U1, R1, D1 = svgs[i]["rect"].irect + for j in 
range(len(svgs)): + if i == j: + continue + L2, U2, R2, D2 = svgs[j]["rect"].irect + ## 包含 + if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_childs[i].append(j) + svg_parents[j].append(i) + else: + ## 交叉 + if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_overlaps[i].append(j) + + #---------------- 确定最终的svg。连通块儿的外围 -------------------# + eps_ERROR = 5 # 给识别出的svg,四周留白(为了防止pyMuPDF的rect不准) + svg_ID = 0 + svg_final_names = [] + svg_final_bboxs = [] + svg_final_visited = [] # 为下面,text识别左准备。作用同img_visited + + svg_idxs = [i for i in range(len(svgs))] + svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1])) # 按照面积,从大到小排序 + + for i in svg_idxs: + if svg_visited[i] == True: + continue + svg_visited[i] = True + L, U, R, D = svgs[i]['rect'].irect + width = R - L + height = D - U + if check_rect_isLine(L, U, R, D) == True: + svg_visited[i] = False + continue + # if i == 4: + # print(i, L, U, R, D) + # print(svg_parents[i]) + + cur_block_element_cnt = 0 # 当前要判定为svg的区域中,有多少elements,最外围的最大svg框除外。 + if len(svg_parents[i]) == 0: + ## 是个普通框的情形 + cur_block_element_cnt += len(svg_childs[i]) + if svg_exceedPage[i] == 0: + ## 误差。可能已经包含在某个框里面了 + neglect_flag = False + for pL, pU, pR, pD in svg_final_bboxs: + if pL <= L <= R <= pR and pU <= U <= D <= pD: + neglect_flag = True + break + if neglect_flag == True: + continue + + ## 搜索连通域, bfs+记忆化 + q = collections.deque() + for j in svg_overlaps[i]: + q.append(j) + while q: + j = q.popleft() + svg_visited[j] = True + L2, U2, R2, D2 = svgs[j]['rect'].irect + # width2 = R2 - L2 + # height2 = D2 - U2 + # if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30: + # continue + L = min(L, L2) + R = max(R, R2) + U = min(U, U2) + D = max(D, D2) + cur_block_element_cnt += 1 + cur_block_element_cnt += len(svg_childs[j]) + for k in svg_overlaps[j]: + if svg_visited[k] == False and 
svg_exceedPage[k] == 0: + svg_visited[k] = True + q.append(k) + elif svg_exceedPage[i] <= 2: + ## 误差。可能已经包含在某个svg_final_bbox框里面了 + neglect_flag = False + for sL, sU, sR, sD in svg_final_bboxs: + if sL <= L <= R <= sR and sU <= U <= D <= sD: + neglect_flag = True + break + if neglect_flag == True: + continue + + L, U, R, D = pageR, pageD, pageL, pageU + ## 所有孩子元素的最大边界 + for j in svg_childs[i]: + if svg_visited[j] == True: + continue + if svg_exceedPage[j] >= 1: + continue + svg_visited[j] = True #### 这个位置考虑一下 + L2, U2, R2, D2 = svgs[j]['rect'].irect + L = min(L, L2) + R = max(R, R2) + U = min(U, U2) + D = max(D, D2) + cur_block_element_cnt += 1 + + # 如果是条line,就不用保存了 + if check_rect_isLine(L, U, R, D) == True: + continue + # 如果当前的svg,连2个elements都没有,就不用保存了 + if cur_block_element_cnt < 3: + continue + + ## 当前svg,框住了多少文本框。如果框多了,可能就是错了 + contain_textLineBlock_cnt = 0 + for L2, U2, R2, D2 in textLine_blocks: + if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True: + contain_textLineBlock_cnt += 1 + if contain_textLineBlock_cnt >= 10: + continue + + # L -= eps_ERROR * 2 + # U -= eps_ERROR + # R += eps_ERROR * 2 + # D += eps_ERROR + # # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True) + # cur_svg = page.get_pixmap(clip=(L,U,R,D)) + new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID) # 图片name + # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 + svg_final_names.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 + svg_final_bboxs.append((L, U, R, D)) + svg_final_visited.append(False) + svg_ID += 1 + + ## 识别出的svg,可能有 包含,相邻的情形。需要进一步合并 + svg_idxs = [i for i in range(len(svg_final_bboxs))] + svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0])) # (U, L) + svg_final_names_2 = [] + svg_final_bboxs_2 = [] + svg_final_visited_2 = [] # 为下面,text识别左准备。作用同img_visited + svg_ID_2 = 0 + for i in range(len(svg_final_bboxs)): + L1, U1, R1, D1 = svg_final_bboxs[i] + 
for j in range(i + 1, len(svg_final_bboxs)): + L2, U2, R2, D2 = svg_final_bboxs[j] + # 如果 rect1包含了rect2 + if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_final_visited[j] = True + continue + # 水平并列 + ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2) + if ratio_1 >= 0.7 and ratio_2 >= 0.7: + if abs(L2 - R1) >= 20: + continue + LL = min(L1, L2) + UU = min(U1, U2) + RR = max(R1, R2) + DD = max(D1, D2) + svg_final_bboxs[i] = (LL, UU, RR, DD) + svg_final_visited[j] = True + continue + # 竖直并列 + ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2) + if ratio_1 >= 0.7 and ratio_2 >= 0.7: + if abs(U2 - D1) >= 20: + continue + LL = min(L1, L2) + UU = min(U1, U2) + RR = max(R1, R2) + DD = max(D1, D2) + svg_final_bboxs[i] = (LL, UU, RR, DD) + svg_final_visited[j] = True + + for i in range(len(svg_final_bboxs)): + if svg_final_visited[i] == False: + L, U, R, D = svg_final_bboxs[i] + svg_final_bboxs_2.append((L, U, R, D)) + + L -= eps_ERROR * 2 + U -= eps_ERROR + R += eps_ERROR * 2 + D += eps_ERROR + # cur_svg = page.get_pixmap(clip=(L,U,R,D)) + new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2) # 图片name + # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 + svg_final_names_2.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 + svg_final_bboxs_2.append((L, U, R, D)) + svg_final_visited_2.append(False) + svg_ID_2 += 1 + + ## svg收尾。识别为drawing,但是在上面没有拼成一张图的。 + # 有收尾才comprehensive + # xxxx + # xxxx + # xxxx + # xxxx + + + #--------- 通过json_from_DocXchain来获取,figure, table, equation的bbox ---------# + figure_bbox_from_DocXChain = [] + + figure_from_DocXChain_visited = [] # 记忆化 + figure_bbox_from_DocXChain_overlappedRatio = [] + + figure_only_from_DocXChain_bboxs = [] # 存储 + figure_only_from_DocXChain_names = [] + figure_only_from_DocXChain_visited = [] + figure_only_ID = 0 + + xf_json = json_from_DocXchain_obj + width_from_json = xf_json['page_info']['width'] + 
height_from_json = xf_json['page_info']['height'] + LR_scaleRatio = width_from_json / (pageR - pageL) + UD_scaleRatio = height_from_json / (pageD - pageU) + + for xf in xf_json['layout_dets']: + # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'} + L = xf['poly'][0] / LR_scaleRatio + U = xf['poly'][1] / UD_scaleRatio + R = xf['poly'][2] / LR_scaleRatio + D = xf['poly'][5] / UD_scaleRatio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + # figure + if xf["category_id"] == 1 and xf['score'] >= 0.3: + figure_bbox_from_DocXChain.append((L, U, R, D)) + figure_from_DocXChain_visited.append(False) + figure_bbox_from_DocXChain_overlappedRatio.append(0.0) + + #---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------# + + ## 比对imgs + for i, b1 in enumerate(figure_bbox_from_DocXChain): + # print('--------- DocXChain的图片', b1) + L1, U1, R1, D1 = b1 + for b2 in img_bboxs: + # print('-------- igms得到的图', b2) + L2, U2, R2, D2 = b2 + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + # 相同 + if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + # 包含 + elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + if s2 / s1 > 0.8: + figure_from_DocXChain_visited[i] = True + elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + if s1 / s2 > 0.8: + figure_from_DocXChain_visited[i] = True + else: + # 重叠了相当一部分 + # print('进入第3部分') + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8): + figure_from_DocXChain_visited[i] = True + else: + 
figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 + # print('图片的重叠率是{}'.format(ratio_1)) + + + ## 比对svgs + svg_final_bboxs_2_badIdxs = [] + for i, b1 in enumerate(figure_bbox_from_DocXChain): + L1, U1, R1, D1 = b1 + for j, b2 in enumerate(svg_final_bboxs_2): + L2, U2, R2, D2 = b2 + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + # 相同 + if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + # 包含 + elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + if s1 / s2 > 0.7: + figure_from_DocXChain_visited[i] = True + else: + svg_final_bboxs_2_badIdxs.append(j) # svg丢弃。用DocXChain的结果。 + else: + # 重叠了相当一部分 + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6): + figure_from_DocXChain_visited[i] = True + else: + figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 + + # 丢掉错误的svg + svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)] + + for i in range(len(figure_from_DocXChain_visited)): + if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7: + figure_from_DocXChain_visited[i] = True + + # DocXChain识别出来的figure,但是没被保存的。 + for i in range(len(figure_from_DocXChain_visited)): + if figure_from_DocXChain_visited[i] == False: + figure_from_DocXChain_visited[i] = True + cur_bbox = figure_bbox_from_DocXChain[i] + # cur_figure = page.get_pixmap(clip=cur_bbox) + new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID) # 图片name + # cur_figure.save(res_dir_path + '/' + new_figure_name) # 把图片存出在新建的文件夹,并命名 + figure_only_from_DocXChain_names.append(new_figure_name) # 把图片的名字存在list中,方便在md中插入引用 + 
figure_only_from_DocXChain_bboxs.append(cur_bbox) + figure_only_from_DocXChain_visited.append(False) + figure_only_ID += 1 + + img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0])) + figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs + + #--------------------------- 最后统一去重 -----------------------------------# + curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) ) + + #### 先考虑包含关系的小块 + final_duplicate = set() + for i in range(len(curPage_all_fig_bboxs)): + L1, U1, R1, D1 = curPage_all_fig_bboxs[i] + for j in range(len(curPage_all_fig_bboxs)): + if i == j: + continue + L2, U2, R2, D2 = curPage_all_fig_bboxs[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + final_duplicate.add((L1, U1, R1, D1)) + else: + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if ratio_1 >= 0.8 and ratio_2 <= 0.6: + final_duplicate.add((L1, U1, R1, D1)) + + curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate] + + #### 再考虑重叠关系的块 + final_duplicate = set() + final_synthetic_bboxs = [] + for i in range(len(curPage_all_fig_bboxs)): + L1, U1, R1, D1 = curPage_all_fig_bboxs[i] + for j in range(len(curPage_all_fig_bboxs)): + if i == j: + continue + L2, U2, R2, D2 = curPage_all_fig_bboxs[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + union_ok = False + if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + union_ok = True + if (ratio_1 > 0.2 and s2 / s1 > 5): + union_ok = True + if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + 
union_ok = True + if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + union_ok = True + if union_ok == True: + final_duplicate.add((L1, U1, R1, D1)) + final_duplicate.add((L2, U2, R2, D2)) + L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) + final_synthetic_bboxs.append((L3, U3, R3, D3)) + + # print('---------- curPage_all_fig_bboxs ---------') + # print(curPage_all_fig_bboxs) + curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate] + final_synthetic_bboxs = list(set(final_synthetic_bboxs)) + + + ## 再再考虑重叠关系。极端情况下会迭代式地2进1 + new_images = [] + droped_img_idx = [] + image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs] + for i in range(0, len(image_bboxes)): + for j in range(i+1, len(image_bboxes)): + if j not in droped_img_idx: + L2, U2, R2, D2 = image_bboxes[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + union_ok = False + if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + union_ok = True + if (ratio_1 > 0.2 and s2 / s1 > 5): + union_ok = True + if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + union_ok = True + if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + union_ok = True + if union_ok == True: + # 合并 + image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3]) + droped_img_idx.append(j) + + for i in range(0, len(image_bboxes)): + if i not in droped_img_idx: + new_images.append(image_bboxes[i]) + + + # find_union_FLAG = True + # while find_union_FLAG == True: + # find_union_FLAG = False + # final_duplicate = set() + # tmp = [] + # for i in range(len(final_synthetic_bboxs)): + # L1, U1, R1, D1 = final_synthetic_bboxs[i] + # 
for j in range(len(final_synthetic_bboxs)): + # if i == j: + # continue + # L2, U2, R2, D2 = final_synthetic_bboxs[j] + # s1 = abs(R1 - L1) * abs(D1 - U1) + # s2 = abs(R2 - L2) * abs(D2 - U2) + # ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + # union_ok = False + # if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + # union_ok = True + # if (ratio_1 > 0.2 and s2 / s1 > 5): + # union_ok = True + # if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + # union_ok = True + # if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + # union_ok = True + # if union_ok == True: + # find_union_FLAG = True + # final_duplicate.add((L1, U1, R1, D1)) + # final_duplicate.add((L2, U2, R2, D2)) + # L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) + # tmp.append((L3, U3, R3, D3)) + # if find_union_FLAG == True: + # tmp = list(set(tmp)) + # final_synthetic_bboxs = tmp[:] + + + # curPage_all_fig_bboxs += final_synthetic_bboxs + # print('--------- final synthetic') + # print(final_synthetic_bboxs) + #**************************************************************************# + images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs] + images = images1 + new_images + return images + diff --git a/magic_pdf/pre_proc/detect_page_number.py b/magic_pdf/pre_proc/detect_page_number.py new file mode 100644 index 0000000000000000000000000000000000000000..35920a99b00222cc93e9566ef131a6553a3cd38b --- /dev/null +++ b/magic_pdf/pre_proc/detect_page_number.py @@ -0,0 +1,64 @@ +from magic_pdf.libs.commons import fitz # pyMuPDF库 +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param 
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect page-number bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) page-number bboxes sorted by (top, left).
    """
    xf_json = json_from_DocXchain_obj
    h_ratio, v_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    pageNo_bbox_from_DocXChain = []
    for det in xf_json['layout_dets']:
        x0 = det['poly'][0] / h_ratio
        y0 = det['poly'][1] / v_ratio
        x1 = det['poly'][2] / h_ratio
        y1 = det['poly'][5] / v_ratio
        left, right = min(x0, x1), max(x0, x1)
        top, bottom = min(y0, y1), max(y0, y1)
        # category 4 == page number, 0.3 confidence threshold
        if det['category_id'] == 4 and det['score'] >= 0.3:
            pageNo_bbox_from_DocXChain.append((left, top, right, bottom))

    # Crop saving is disabled; names are generated only for parity with the
    # other detect_* helpers.
    pageNo_final_names = []
    pageNo_final_bboxs = []
    for pageNo_ID, bbox in enumerate(pageNo_bbox_from_DocXChain):
        # cur_pageNo = page.get_pixmap(clip=bbox)
        pageNo_final_names.append("pageNo_{}_{}.png".format(page_ID, pageNo_ID))
        # cur_pageNo.save(res_dir_path + '/' + new_pageNo_name)
        pageNo_final_bboxs.append(bbox)

    pageNo_final_bboxs.sort(key=lambda bbox: (bbox[1], bbox[0]))
    curPage_all_pageNo_bboxs = pageNo_final_bboxs
    return curPage_all_pageNo_bboxs
from magic_pdf.libs.commons import fitz  # PyMuPDF


def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect table bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (rendered once to get the page size).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) table bboxes sorted by (top, left).
    """
    # Page geometry at 72 DPI (PDF points).
    DPI = 72
    pix = page.get_pixmap(dpi=DPI)
    pageL = 0
    pageR = int(pix.w)
    pageU = 0
    pageD = int(pix.h)

    # --------- table bboxes reported by DocXChain --------- #
    table_bbox_from_DocXChain = []

    xf_json = json_from_DocXchain_obj
    width_from_json = xf_json['page_info']['width']
    height_from_json = xf_json['page_info']['height']
    LR_scaleRatio = width_from_json / (pageR - pageL)
    UD_scaleRatio = height_from_json / (pageD - pageU)

    for xf in xf_json['layout_dets']:
        # categories: 0 title, 1 figure, 2 plain text, 3 header, 4 page number,
        # 5 footnote, 6 footer, 7 table, 8 table caption, 9 figure caption,
        # 10 equation, 11 full column, 12 sub column,
        # 13 embedded equation, 14 isolated equation
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if xf['category_id'] == 7 and xf['score'] >= 0.3:
            table_bbox_from_DocXChain.append((L, U, R, D))

    # Crop saving is disabled; names kept for parity with the other detect_* helpers.
    table_final_names = []
    table_final_bboxs = []
    table_ID = 0
    for L, U, R, D in table_bbox_from_DocXChain:
        # cur_table = page.get_pixmap(clip=(L, U, R, D))
        new_table_name = "table_{}_{}.png".format(page_ID, table_ID)
        # cur_table.save(res_dir_path + '/' + new_table_name)
        table_final_names.append(new_table_name)
        table_final_bboxs.append((L, U, R, D))
        table_ID += 1

    table_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    curPage_all_table_bboxs = table_final_bboxs
    return curPage_all_table_bboxs


"""
Replace equations inside the pymupdf structure with the recognition results
produced by the equation model.
"""

from magic_pdf.libs.commons import fitz
import json
import os
from pathlib import Path
from loguru import logger
from magic_pdf.libs.ocr_content_type import ContentType

TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation


def combine_chars_to_pymudict(block_dict, char_dict):
    """
    Merge the char-level structure into the block-level pymupdf dict.

    block_dict has been cropped upstream, so blocks and lines are first
    aligned against char_dict by bbox before the 'chars' arrays are copied.
    """
    # Align the two structures by block bbox.
    char_map = {tuple(item["bbox"]): item for item in char_dict}

    for i in range(len(block_dict)):  # block level
        block = block_dict[i]
        key = block["bbox"]
        char_dict_item = char_map[tuple(key)]
        char_dict_map = {tuple(item["bbox"]): item for item in char_dict_item["lines"]}
        for j in range(len(block["lines"])):
            lines = block["lines"][j]
            with_char_lines = char_dict_map[lines["bbox"]]
            for k in range(len(lines["spans"])):
                spans = lines["spans"][k]
                try:
                    chars = with_char_lines["spans"][k]["chars"]
                except Exception:
                    logger.error(char_dict[i]["lines"][j])
                    # BUGFIX: the original fell through and assigned the
                    # unbound (or stale, from a previous iteration) local
                    # `chars`, raising NameError on the first failure;
                    # skip this span instead.
                    continue
                spans["chars"] = chars

    return block_dict


def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
    """
    Ratio of the intersection area of bbox1 and min_bbox to min_bbox's area.
    Returns 0 when the boxes do not intersect or min_bbox is degenerate.
    """
    # Coordinates of the intersection rectangle.
    x_left = max(bbox1[0], min_bbox[0])
    y_top = max(bbox1[1], min_bbox[1])
    x_right = min(bbox1[2], min_bbox[2])
    y_bottom = min(bbox1[3], min_bbox[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    min_box_area = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0])
    if min_box_area == 0:
        return 0
    else:
        return intersection_area / min_box_area
+ if x_right < x_left or y_bottom < y_top: + return 0.0 + + # The area of overlap area + intersection_area = (x_right - x_left) * (y_bottom - y_top) + min_box_area = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0]) + if min_box_area == 0: + return 0 + else: + return intersection_area / min_box_area + + +def _is_xin(bbox1, bbox2): + area1 = abs(bbox1[2] - bbox1[0]) * abs(bbox1[3] - bbox1[1]) + area2 = abs(bbox2[2] - bbox2[0]) * abs(bbox2[3] - bbox2[1]) + if area1 < area2: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox2, bbox1) + else: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) + + return ratio > 0.6 + + +def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks): + """消除掉整个块都在行间公式块内部的文本块""" + for eq_bbox in interline_bboxes: + removed_txt_blk = [] + for text_blk in text_blocks: + text_bbox = text_blk["bbox"] + if ( + calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], text_bbox) + >= 0.7 + ): + removed_txt_blk.append(text_blk) + for blk in removed_txt_blk: + text_blocks.remove(blk) + + return text_blocks + + +def _is_in_or_part_overlap(box1, box2) -> bool: + """ + 两个bbox是否有部分重叠或者包含 + """ + if box1 is None or box2 is None: + return False + + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + return not ( + x1_1 < x0_2 # box1在box2的左边 + or x0_1 > x1_2 # box1在box2的右边 + or y1_1 < y0_2 # box1在box2的上边 + or y0_1 > y1_2 + ) # box1在box2的下边 + + +def remove_text_block_overlap_interline_equation_bbox( + interline_eq_bboxes, pymu_block_list +): + + """消除掉行行内公式有部分重叠的文本块的内容。 + 同时重新计算消除重叠之后文本块的大小""" + deleted_block = [] + for text_block in pymu_block_list: + deleted_line = [] + for line in text_block["lines"]: + deleted_span = [] + for span in line["spans"]: + deleted_chars = [] + for char in span["chars"]: + if any( + [ + (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5) + for eq_bbox in interline_eq_bboxes + ] + ): + deleted_chars.append(char) + # 
检查span里没有char则删除这个span + for char in deleted_chars: + span["chars"].remove(char) + # 重新计算这个span的大小 + if len(span["chars"]) == 0: # 删除这个span + deleted_span.append(span) + else: + span["bbox"] = ( + min([b["bbox"][0] for b in span["chars"]]), + min([b["bbox"][1] for b in span["chars"]]), + max([b["bbox"][2] for b in span["chars"]]), + max([b["bbox"][3] for b in span["chars"]]), + ) + + # 检查这个span + for span in deleted_span: + line["spans"].remove(span) + if len(line["spans"]) == 0: # 删除这个line + deleted_line.append(line) + else: + line["bbox"] = ( + min([b["bbox"][0] for b in line["spans"]]), + min([b["bbox"][1] for b in line["spans"]]), + max([b["bbox"][2] for b in line["spans"]]), + max([b["bbox"][3] for b in line["spans"]]), + ) + + # 检查这个block是否可以删除 + for line in deleted_line: + text_block["lines"].remove(line) + if len(text_block["lines"]) == 0: # 删除block + deleted_block.append(text_block) + else: + text_block["bbox"] = ( + min([b["bbox"][0] for b in text_block["lines"]]), + min([b["bbox"][1] for b in text_block["lines"]]), + max([b["bbox"][2] for b in text_block["lines"]]), + max([b["bbox"][3] for b in text_block["lines"]]), + ) + + # 检查text block删除 + for block in deleted_block: + pymu_block_list.remove(block) + if len(pymu_block_list) == 0: + return [] + + return pymu_block_list + + +def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list): + """在行间公式对应的地方插上一个伪造的block""" + for eq in interline_eq_bboxes: + bbox = eq["bbox"] + latex_content = eq["latex"] + text_block = { + "number": len(pymu_block_list), + "type": 0, + "bbox": bbox, + "lines": [ + { + "spans": [ + { + "size": 9.962599754333496, + "type": TYPE_INTERLINE_EQUATION, + "flags": 4, + "font": TYPE_INTERLINE_EQUATION, + "color": 0, + "ascender": 0.9409999847412109, + "descender": -0.3050000071525574, + "latex": latex_content, + "origin": [bbox[0], bbox[1]], + "bbox": bbox, + } + ], + "wmode": 0, + "dir": [1.0, 0.0], + "bbox": bbox, + } + ], + } + pymu_block_list.append(text_block) + 
+ +def x_overlap_ratio(box1, box2): + a, _, c, _ = box1 + e, _, g, _ = box2 + + # 计算重叠宽度 + overlap_x = max(min(c, g) - max(a, e), 0) + + # 计算box1的宽度 + width1 = g - e + + # 计算重叠比例 + overlap_ratio = overlap_x / width1 if width1 != 0 else 0 + + return overlap_ratio + + +def __is_x_dir_overlap(bbox1, bbox2): + return not (bbox1[2] < bbox2[0] or bbox1[0] > bbox2[2]) + + +def __y_overlap_ratio(box1, box2): + """""" + _, b, _, d = box1 + _, f, _, h = box2 + + # 计算重叠高度 + overlap_y = max(min(d, h) - max(b, f), 0) + + # 计算box1的高度 + height1 = d - b + + # 计算重叠比例 + overlap_ratio = overlap_y / height1 if height1 != 0 else 0 + + return overlap_ratio + + +def replace_line_v2(eqinfo, line): + """ + 扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。 + 最后与这个x0,x1有相交的span0, span1内部进行分割。 + """ + first_overlap_span = -1 + first_overlap_span_idx = -1 + last_overlap_span = -1 + delete_chars = [] + for i in range(0, len(line["spans"])): + if "chars" not in line["spans"][i]: + continue + + if line["spans"][i].get("_type", None) is not None: + continue # 忽略,因为已经是插入的伪造span公式了 + + for char in line["spans"][i]["chars"]: + if __is_x_dir_overlap(eqinfo["bbox"], char["bbox"]): + line_txt = "" + for span in line["spans"]: + span_txt = "" + for ch in span["chars"]: + span_txt = span_txt + ch["c"] + + span_txt = span_txt + "" + + line_txt = line_txt + span_txt + + if first_overlap_span_idx == -1: + first_overlap_span = line["spans"][i] + first_overlap_span_idx = i + last_overlap_span = line["spans"][i] + delete_chars.append(char) + + # 第一个和最后一个char要进行检查,到底属于公式多还是属于正常span多 + if len(delete_chars) > 0: + ch0_bbox = delete_chars[0]["bbox"] + if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51: + delete_chars.remove(delete_chars[0]) + if len(delete_chars) > 0: + ch0_bbox = delete_chars[-1]["bbox"] + if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51: + delete_chars.remove(delete_chars[-1]) + + # 计算x方向上被删除区间内的char的真实x0, x1 + if len(delete_chars): + x0, x1 = min([b["bbox"][0] for b in 
delete_chars]), max( + [b["bbox"][2] for b in delete_chars] + ) + else: + # logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}") + return False + + # 删除位于x0, x1这两个中间的span + delete_span = [] + for span in line["spans"]: + span_box = span["bbox"] + if x0 <= span_box[0] and span_box[2] <= x1: + delete_span.append(span) + for span in delete_span: + line["spans"].remove(span) + + equation_span = { + "size": 9.962599754333496, + "type": TYPE_INLINE_EQUATION, + "flags": 4, + "font": TYPE_INLINE_EQUATION, + "color": 0, + "ascender": 0.9409999847412109, + "descender": -0.3050000071525574, + "latex": "", + "origin": [337.1410153102337, 216.0205245153934], + "bbox": eqinfo["bbox"] + } + # equation_span = line['spans'][0].copy() + equation_span["latex"] = eqinfo['latex'] + equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]] + equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]] + equation_span["chars"] = delete_chars + equation_span["type"] = TYPE_INLINE_EQUATION + equation_span["_eq_bbox"] = eqinfo["bbox"] + line["spans"].insert(first_overlap_span_idx + 1, equation_span) # 放入公式 + + # logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】") + + # 第一个、和最后一个有overlap的span进行分割,然后插入对应的位置 + first_span_chars = [ + char + for char in first_overlap_span["chars"] + if (char["bbox"][2] + char["bbox"][0]) / 2 < x0 + ] + tail_span_chars = [ + char + for char in last_overlap_span["chars"] + if (char["bbox"][0] + char["bbox"][2]) / 2 > x1 + ] + + if len(first_span_chars) > 0: + first_overlap_span["chars"] = first_span_chars + first_overlap_span["text"] = "".join([char["c"] for char in first_span_chars]) + first_overlap_span["bbox"] = ( + first_overlap_span["bbox"][0], + first_overlap_span["bbox"][1], + max([chr["bbox"][2] for chr in first_span_chars]), + first_overlap_span["bbox"][3], + ) + # first_overlap_span['_type'] = "first" + else: + # 删掉 + if first_overlap_span not in delete_span: + 
line["spans"].remove(first_overlap_span) + + if len(tail_span_chars) > 0: + min_of_tail_span_x0 = min([chr["bbox"][0] for chr in tail_span_chars]) + min_of_tail_span_y0 = min([chr["bbox"][1] for chr in tail_span_chars]) + max_of_tail_span_x1 = max([chr["bbox"][2] for chr in tail_span_chars]) + max_of_tail_span_y1 = max([chr["bbox"][3] for chr in tail_span_chars]) + + if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的 + tail_span_txt = "".join([char["c"] for char in tail_span_chars]) + last_span_to_insert = last_overlap_span.copy() + last_span_to_insert["chars"] = tail_span_chars + last_span_to_insert["text"] = "".join( + [char["c"] for char in tail_span_chars] + ) + if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]: + last_span_to_insert["bbox"] = ( + min_of_tail_span_x0, + min_of_tail_span_y0, + max_of_tail_span_x1, + max_of_tail_span_y1 + ) + else: + last_span_to_insert["bbox"] = ( + min([chr["bbox"][0] for chr in tail_span_chars]), + last_overlap_span["bbox"][1], + last_overlap_span["bbox"][2], + last_overlap_span["bbox"][3], + ) + # 插入到公式对象之后 + equation_idx = line["spans"].index(equation_span) + line["spans"].insert(equation_idx + 1, last_span_to_insert) # 放入公式 + else: # 直接修改原来的span + last_overlap_span["chars"] = tail_span_chars + last_overlap_span["text"] = "".join([char["c"] for char in tail_span_chars]) + last_overlap_span["bbox"] = ( + min([chr["bbox"][0] for chr in tail_span_chars]), + last_overlap_span["bbox"][1], + last_overlap_span["bbox"][2], + last_overlap_span["bbox"][3], + ) + else: + # 删掉 + if ( + last_overlap_span not in delete_span + and last_overlap_span != first_overlap_span + ): + line["spans"].remove(last_overlap_span) + + remain_txt = "" + for span in line["spans"]: + span_txt = "" + for char in span["chars"]: + span_txt = span_txt + char["c"] + + span_txt = span_txt + "" + + remain_txt = remain_txt + span_txt + + # logger.info(f"<== succ replace, text is 【{remain_txt}】, equation is 【{eqinfo['latex_text']}】") + + return 
True + + +def replace_eq_blk(eqinfo, text_block): + """替换行内公式""" + for line in text_block["lines"]: + line_bbox = line["bbox"] + if ( + _is_xin(eqinfo["bbox"], line_bbox) + or __y_overlap_ratio(eqinfo["bbox"], line_bbox) > 0.6 + ): # 定位到行, 使用y方向重合率是因为有的时候,一个行的宽度会小于公式位置宽度:行很高,公式很窄, + replace_succ = replace_line_v2(eqinfo, line) + if ( + not replace_succ + ): # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行 + continue + else: + break + else: + return False + return True + + +def replace_inline_equations(inline_equation_bboxes, raw_text_blocks): + """替换行内公式""" + for eqinfo in inline_equation_bboxes: + eqbox = eqinfo["bbox"] + for blk in raw_text_blocks: + if _is_xin(eqbox, blk["bbox"]): + if not replace_eq_blk(eqinfo, blk): + logger.warning(f"行内公式没有替换成功:{eqinfo} ") + else: + break + + return raw_text_blocks + + +def remove_chars_in_text_blocks(text_blocks): + """删除text_blocks里的char""" + for blk in text_blocks: + for line in blk["lines"]: + for span in line["spans"]: + _ = span.pop("chars", "no such key") + return text_blocks + + +def replace_equations_in_textblock( + raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes +): + """ + 替换行间和和行内公式为latex + """ + raw_text_blocks = remove_text_block_in_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消除重叠:第一步,在公式内部的 + + raw_text_blocks = remove_text_block_overlap_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消重,第二步,和公式覆盖的 + + insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks) + raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks) + return raw_text_blocks + + +def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path): + """ """ + new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf" + with open(json_path, "r", encoding="utf-8") as f: + obj = json.loads(f.read()) + + if os.path.exists(new_pdf): + os.remove(new_pdf) + new_doc = fitz.open("") + + 
doc = fitz.open(pdf_path) + new_doc = fitz.open(pdf_path) + for i in range(len(new_doc)): + page = new_doc[i] + inline_equation_bboxes = obj[f"page_{i}"]["inline_equations"] + interline_equation_bboxes = obj[f"page_{i}"]["interline_equations"] + raw_text_blocks = obj[f"page_{i}"]["preproc_blocks"] + raw_text_blocks = remove_text_block_in_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消除重叠:第一步,在公式内部的 + raw_text_blocks = remove_text_block_overlap_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消重,第二步,和公式覆盖的 + insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks) + raw_text_blocks = replace_inline_equations( + inline_equation_bboxes, raw_text_blocks + ) + + # 为了检验公式是否重复,把每一行里,含有公式的span背景改成黄色的 + color_map = [fitz.pdfcolor["blue"], fitz.pdfcolor["green"]] + j = 0 + for blk in raw_text_blocks: + for i, line in enumerate(blk["lines"]): + + # line_box = line['bbox'] + # shape = page.new_shape() + # shape.draw_rect(line_box) + # shape.finish(color=fitz.pdfcolor['red'], fill=color_map[j%2], fill_opacity=0.3) + # shape.commit() + # j = j+1 + + for i, span in enumerate(line["spans"]): + shape_page = page.new_shape() + span_type = span.get("_type") + color = fitz.pdfcolor["blue"] + if span_type == "first": + color = fitz.pdfcolor["blue"] + elif span_type == "tail": + color = fitz.pdfcolor["green"] + elif span_type == TYPE_INLINE_EQUATION: + color = fitz.pdfcolor["black"] + else: + color = None + + b = span["bbox"] + shape_page.draw_rect(b) + + shape_page.finish(color=None, fill=color, fill_opacity=0.3) + shape_page.commit() + + new_doc.save(new_pdf) + logger.info(f"save ok {new_pdf}") + final_json = json.dumps(obj, ensure_ascii=False, indent=2) + with open("equations_test/final_json.json", "w") as f: + f.write(final_json) + + return new_pdf + + +if __name__ == "__main__": + # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf) + pass diff --git 
a/magic_pdf/pre_proc/fix_image.py b/magic_pdf/pre_proc/fix_image.py new file mode 100644 index 0000000000000000000000000000000000000000..d2f83570d6ca0f30f1c9bf0ddf90a916fdb80c91 --- /dev/null +++ b/magic_pdf/pre_proc/fix_image.py @@ -0,0 +1,244 @@ + + + +import re +from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox + +from magic_pdf.libs.textbase import get_text_block_base_info + +def fix_image_vertical(image_bboxes:list, text_blocks:list): + """ + 修正图片的位置 + 如果图片与文字block发生一定重叠(也就是图片切到了一部分文字),那么减少图片边缘,让文字和图片不再重叠。 + 只对垂直方向进行。 + """ + for image_bbox in image_bboxes: + for text_block in text_blocks: + text_bbox = text_block["bbox"] + if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]): + if text_bbox[1] < image_bbox[1]:#在图片上方 + image_bbox[1] = text_bbox[3]+1 + elif text_bbox[3]>image_bbox[3]:#在图片下方 + image_bbox[3] = text_bbox[1]-1 + + return image_bboxes + +def __merge_if_common_edge(bbox1, bbox2): + x_min_1, y_min_1, x_max_1, y_max_1 = bbox1 + x_min_2, y_min_2, x_max_2, y_max_2 = bbox2 + + # 检查是否有公共的水平边 + if y_min_1 == y_min_2 or y_max_1 == y_max_2: + # 确保一个框的x范围在另一个框的x范围内 + if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2): + return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)] + + # 检查是否有公共的垂直边 + if x_min_1 == x_min_2 or x_max_1 == x_max_2: + # 确保一个框的y范围在另一个框的y范围内 + if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2): + return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)] + + # 如果没有公共边 + return None + +def fix_seperated_image(image_bboxes:list): + """ + 如果2个图片有一个边重叠,那么合并2个图片 + """ + new_images = [] + droped_img_idx = [] + + for i in range(0, len(image_bboxes)): + for j in range(i+1, len(image_bboxes)): + new_img = 
__merge_if_common_edge(image_bboxes[i], image_bboxes[j]) + if new_img is not None: + new_images.append(new_img) + droped_img_idx.append(i) + droped_img_idx.append(j) + break + + for i in range(0, len(image_bboxes)): + if i not in droped_img_idx: + new_images.append(image_bboxes[i]) + + return new_images + + +def __check_img_title_pattern(text): + """ + 检查文本段是否是表格的标题 + """ + patterns = [r"^(fig|figure).*", r"^(scheme).*"] + text = text.strip() + for pattern in patterns: + match = re.match(pattern, text, re.IGNORECASE) + if match: + return True + return False + +def __get_fig_caption_text(text_block): + txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans']) + line_cnt = len(text_block['lines']) + txt = txt.replace("Ž . ", '') + return txt, line_cnt + + +def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box): + """ + 继续向下方寻找和图片caption字号,字体,颜色一样的文字框,合并入caption。 + text_block是已经找到的图片catpion(这个caption可能不全,多行被划分到多个pymu block里了) + """ + combined_image_caption_text_block = list(text_block.copy()['bbox']) + base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block) + while True: + tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block) + if not tb_add: + break + tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add) + if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type: + combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0]) + combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2]) + combined_image_caption_text_block[3] = tb_add['bbox'][3] + else: + break + + image_box[0] = min(image_box[0], combined_image_caption_text_block[0]) + image_box[1] = min(image_box[1], combined_image_caption_text_block[1]) + image_box[2] = max(image_box[2], combined_image_caption_text_block[2]) + image_box[3] = max(image_box[3], 
combined_image_caption_text_block[3]) + text_block['_image_caption'] = True + + +def include_img_title(pymu_blocks, image_bboxes: list): + """ + 向上方和下方寻找符合图片title的文本block,合并到图片里 + 如果图片上下都有fig的情况怎么办?寻找标题距离最近的那个。 + --- + 增加对左侧和右侧图片标题的寻找 + """ + + + for tb in image_bboxes: + # 优先找下方的 + max_find_cnt = 3 # 向上,向下最多找3个就停止 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_btn: + txt, line_cnt = __get_fig_caption_text(text_block_btn) + if len(txt.strip())>0: + if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # 设置line_cnt<=2目的是为了跳过子标题,或者有时候图片下方文字没有被图片识别模型放入图片里 + max_find_cnt = max_find_cnt - 1 + temp_box[3] = text_block_btn['bbox'][3] + continue + else: + break + else: + temp_box[3] = text_block_btn['bbox'][3] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + max_find_cnt = 3 # 向上,向下最多找3个就停止 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_top: + txt, line_cnt = __get_fig_caption_text(text_block_top) + if len(txt.strip())>0: + if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3: + max_find_cnt = max_find_cnt - 1 + temp_box[1] = text_block_top['bbox'][1] + continue + else: + break + else: + b = text_block_top['bbox'] + temp_box[1] = b[1] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False : + btn_text, _ = __get_fig_caption_text(text_block_btn) + top_text, _ = __get_fig_caption_text(text_block_top) + if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text): + # 取距离图片最近的 + btn_text_distance = text_block_btn['bbox'][1] - tb[3] + top_text_distance = tb[1] - text_block_top['bbox'][3] + if btn_text_distance= 5: + cur_line = (LL, UU, RR, DD) + res.append(cur_line) + LL = L1 + 
else: + RR = max(RR, R1) + cur_line = (LL, UU, RR, DD) + res.append(cur_line) + return res + +def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int): + """ + :param page :fitz读取的当前页的内容 + :param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D) + :param include_table_title: 是否将表格的标题也圈进来 + :param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题 + """ + + drawings_lines = get_merged_line(page) + fix_table_bboxes = [] + + for table in table_bboxes: + (L, U, R, D) = table + fix_table_L = [] + fix_table_U = [] + fix_table_R = [] + fix_table_D = [] + width = R - L + width_range = width * 0.1 # 只看距离表格整体宽度10%之内偏差的线 + height = D - U + height_range = height * 0.1 # 只看距离表格整体高度10%之内偏差的线 + for line in drawings_lines: + if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # 相近的宽度 + if (U - height_range) < line[1] < (U + height_range): # 上边界,在一定的高度范围内 + fix_table_U.append(line[1]) + fix_table_L.append(line[0]) + fix_table_R.append(line[2]) + elif (D - height_range) < line[1] < (D + height_range): # 下边界,在一定的高度范围内 + fix_table_D.append(line[1]) + fix_table_L.append(line[0]) + fix_table_R.append(line[2]) + + if fix_table_U: + U = min(fix_table_U) + if fix_table_D: + D = max(fix_table_D) + if fix_table_L: + L = min(fix_table_L) + if fix_table_R: + R = max(fix_table_R) + + if include_table_title: # 需要将表格标题包括 + text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # 所有的text的block + incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # 将与表格完全没有任何遮挡的文字筛除掉(比如另一栏的文字) + upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # 将在表格线以上的text block筛选出来 + sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # 按照text block的下边界距离表格上边界的距离升序排序,如果是同一个高度,则先左再右 + + for idx in range(scan_line_num): + if idx+1 
<= len(sorted_filtered_text_blocks): + line_temp = sorted_filtered_text_blocks[idx]['lines'] + if line_temp: + text = line_temp[0]['spans'][0]['text'] # 提取出第一个span里的text内容 + check_en = re.match('Table', text) # 检查是否有Table开头的(英文) + check_ch = re.match('表', text) # 检查是否有Table开头的(中文) + if check_en or check_ch: + if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # 以防出现负的bbox + U = sorted_filtered_text_blocks[idx]['bbox'][1] + + fix_table_bboxes.append([L-2, U-2, R+2, D+2]) + + return fix_table_bboxes + +def __check_table_title_pattern(text): + """ + 检查文本段是否是表格的标题 + """ + patterns = [r'^table\s\d+'] + + for pattern in patterns: + match = re.match(pattern, text, re.IGNORECASE) + if match: + return True + else: + return False + + +def fix_table_text_block(pymu_blocks, table_bboxes: list): + """ + 调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界 + 例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf + """ + for tb in table_bboxes: + (L, U, R, D) = tb + for block in pymu_blocks: + if _is_in_or_part_overlap((L, U, R, D), block['bbox']): + txt = " ".join(span['text'] for line in block['lines'] for span in line['spans']) + if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。 + tb[0] = min(tb[0], block['bbox'][0]) + tb[1] = min(tb[1], block['bbox'][1]) + tb[2] = max(tb[2], block['bbox'][2]) + tb[3] = max(tb[3], block['bbox'][3]) + block['_table'] = True # 占位,防止其他table再次占用 + + """如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠""" + if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt): + block['bbox'] = list(block['bbox']) + if block['bbox'][3] > U: + block['bbox'][3] = U-1 + if block['bbox'][1] < D: + block['bbox'][1] = D+1 + + + return table_bboxes + + +def __get_table_caption_text(text_block): + txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans']) + line_cnt = len(text_block['lines']) + 
txt = txt.replace("Ž . ", '') + return txt, line_cnt + + +def include_table_title(pymu_blocks, table_bboxes: list): + """ + 把表格的title也包含进来,扩展到table_bbox上 + """ + for tb in table_bboxes: + max_find_cnt = 3 # 上上最多找3次 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_top: + txt, line_cnt = __get_table_caption_text(text_block_top) + if len(txt.strip())>0: + if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: + max_find_cnt = max_find_cnt -1 + temp_box[1] = text_block_top['bbox'][1] + continue + else: + break + else: + temp_box[1] = text_block_top['bbox'][1] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + max_find_cnt = 3 # 向下找 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_bottom: + txt, line_cnt = __get_table_caption_text(text_block_bottom) + if len(txt.strip())>0: + if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: + max_find_cnt = max_find_cnt - 1 + temp_box[3] = text_block_bottom['bbox'][3] + continue + else: + break + else: + temp_box[3] = text_block_bottom['bbox'][3] + max_find_cnt = max_find_cnt - 1 + else: + break + + if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False : + btn_text, _ = __get_table_caption_text(text_block_bottom) + top_text, _ = __get_table_caption_text(text_block_top) + if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # 上下都有一个tbale的caption + # 取距离最近的 + btn_text_distance = text_block_bottom['bbox'][1] - tb[3] + top_text_distance = tb[1] - text_block_top['bbox'][3] + text_block = text_block_bottom if btn_text_distance 0] + if span_font: + # main_text_font应该用基于字数最多的字体而不是span级别的统计 + # font_names.append(font_name for font_name in span_font) + # block_fonts.append(font_name for 
font_name in span_font) + for font, count in span_font: + font_names[font] += count + main_text_font = font_names.most_common(1)[0][0] + return main_text_font + diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py new file mode 100644 index 0000000000000000000000000000000000000000..9f07276de36175fc0637f64522f88af4e3cb90c4 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py @@ -0,0 +1,115 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \ + calculate_iou +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import BlockType +from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block + + +def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks, + title_blocks, interline_equation_blocks, page_w, page_h): + all_bboxes = [] + all_discarded_blocks = [] + for image in img_blocks: + x0, y0, x1, y1 = image['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]]) + + for table in table_blocks: + x0, y0, x1, y1 = table['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]]) + + for text in text_blocks: + x0, y0, x1, y1 = text['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]]) + + for title in title_blocks: + x0, y0, x1, y1 = title['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]]) + + for interline_equation in interline_equation_blocks: + x0, y0, x1, y1 = interline_equation['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]]) + + '''block嵌套问题解决''' + 
'''文本框与标题框重叠,优先信任文本框''' + all_bboxes = fix_text_overlap_title_blocks(all_bboxes) + '''任何框体与舍弃框重叠,优先信任舍弃框''' + all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) + # @todo interline_equation 与title或text框冲突的情况,分两种情况处理 + '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框''' + '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' + + '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' + for discarded in discarded_blocks: + x0, y0, x1, y1 = discarded['bbox'] + all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]]) + # 将footnote加入到all_bboxes中,用来计算layout + if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]]) + + '''经过以上处理后,还存在大框套小框的情况,则删除小框''' + all_bboxes = remove_overlaps_min_blocks(all_bboxes) + all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) + '''将剩余的bbox做分离处理,防止后面分layout时出错''' + all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) + + return all_bboxes, all_discarded_blocks, drop_reasons + + +def fix_text_overlap_title_blocks(all_bboxes): + # 先提取所有text和title block + text_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.Text: + text_blocks.append(block) + title_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.Title: + title_blocks.append(block) + + for text_block in text_blocks: + for title_block in title_blocks: + text_block_bbox = text_block[:4] + title_block_bbox = title_block[:4] + if calculate_iou(text_block_bbox, title_block_bbox) > 0.8: + all_bboxes.remove(title_block) + + return all_bboxes + + +def remove_need_drop_blocks(all_bboxes, discarded_blocks): + need_remove = [] + for block in all_bboxes: + for discarded_block in discarded_blocks: + block_bbox = block[:4] + if 
calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6: + if block not in need_remove: + need_remove.append(block) + break + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + return all_bboxes + + +def remove_overlaps_min_blocks(all_bboxes): + # 删除重叠blocks中较小的那些 + need_remove = [] + for block1 in all_bboxes: + for block2 in all_bboxes: + if block1 != block2: + block1_bbox = block1[:4] + block2_bbox = block2[:4] + overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8) + if overlap_box is not None: + bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None) + if bbox_to_remove is not None and bbox_to_remove not in need_remove: + need_remove.append(bbox_to_remove) + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + + return all_bboxes diff --git a/magic_pdf/pre_proc/ocr_detect_layout.py b/magic_pdf/pre_proc/ocr_detect_layout.py new file mode 100644 index 0000000000000000000000000000000000000000..4dad3593d69696b65b5650e9b9dbadd33c11b595 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_detect_layout.py @@ -0,0 +1,133 @@ +import fitz + +from magic_pdf.layout.layout_sort import get_bboxes_layout +from magic_pdf.libs.boxbase import _is_part_overlap, _is_in +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def get_center_point(bbox): + """ + 根据边界框坐标信息,计算出该边界框的中心点坐标。 + Args: + bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。 + Returns: + list: 中心点坐标信息,包含两个元素,分别为x坐标和y坐标。 + """ + return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] + + +def get_area(bbox): + """ + 根据边界框坐标信息,计算出该边界框的面积。 + Args: + bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。 + Returns: + float: 该边界框的面积。 + """ + return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + + +def adjust_layouts(layout_bboxes, page_boundry, page_id): + # 遍历所有布局框 + for i in range(len(layout_bboxes)): + # 遍历当前布局框之后的布局框 + for j in range(i + 1, 
len(layout_bboxes)): + # 判断两个布局框是否重叠 + if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]): + # 计算每个布局框的中心点坐标和面积 + area_i = get_area(layout_bboxes[i]) + area_j = get_area(layout_bboxes[j]) + + # 较大布局框和较小布局框的赋值 + if area_i > area_j: + larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j] + else: + larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i] + + center_large = get_center_point(larger_layout) + center_small = get_center_point(smaller_layout) + # 计算横向和纵向的距离差 + distance_x = center_large[0] - center_small[0] + distance_y = center_large[1] - center_small[1] + + # 根据距离差判断重叠方向并修正边界 + if abs(distance_x) > abs(distance_y): # 左右重叠 + if distance_x > 0 and larger_layout[0] < smaller_layout[2]: + larger_layout[0] = smaller_layout[2]+1 + if distance_x < 0 and larger_layout[2] > smaller_layout[0]: + larger_layout[2] = smaller_layout[0]-1 + else: # 上下重叠 + if distance_y > 0 and larger_layout[1] < smaller_layout[3]: + larger_layout[1] = smaller_layout[3]+1 + if distance_y < 0 and larger_layout[3] > smaller_layout[1]: + larger_layout[3] = smaller_layout[1]-1 + # 排序调整布局边界框列表 + new_bboxes = [] + for layout_bbox in layout_bboxes: + new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None]) + + layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id) + + # 返回排序调整后的布局边界框列表 + return layout_bboxes, layout_tree + + +def layout_detect(layout_info, page: fitz.Page, ocr_page_info): + """ + 对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。 + + Args: + layout_info (list): 包含子布局信息的列表,每个子布局信息为字典类型,包含'poly'字段,表示子布局的边界框坐标信息。 + + Returns: + list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。 + + """ + page_id = ocr_page_info['page_info']['page_no']-1 + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page) + # 初始化布局边界框列表 + layout_bboxes = [] + # 遍历每个子布局 + for sub_layout in layout_info: + # 提取子布局的边界框坐标信息 + x0, y0, _, _, x1, 
y1, _, _ = sub_layout['poly'] + bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio), + int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)] + + # 将子布局的边界框添加到列表中 + layout_bboxes.append(bbox) + + # 初始化新的布局边界框列表 + new_layout_bboxes = [] + # 遍历每个布局边界框 + for i in range(len(layout_bboxes)): + # 初始化标记变量,用于判断当前边界框是否需要保留 + keep = True + # 获取当前边界框的坐标信息 + box_i = layout_bboxes[i] + + # 遍历其他边界框 + for j in range(len(layout_bboxes)): + # 排除当前边界框自身 + if i != j: + # 获取其他边界框的坐标信息 + box_j = layout_bboxes[j] + # 检测box_i是否被box_j包含 + if _is_in(box_i, box_j): + # 如果当前边界框被其他边界框包含,则标记为不需要保留 + keep = False + # 跳出内层循环 + break + + # 如果当前边界框需要保留,则添加到新的布局边界框列表中 + if keep: + new_layout_bboxes.append(layout_bboxes[i]) + + # 对新的布局边界框列表进行排序调整 + page_width = page.rect.width + page_height = page.rect.height + page_boundry = [0, 0, page_width, page_height] + layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id) + + # 返回排序调整后的布局边界框列表 + return layout_bboxes, layout_tree diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..74c1f89f2c7a4c98fd4ae7e971d84f68644c2815 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_dict_merge.py @@ -0,0 +1,336 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ + calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import ContentType, BlockType +from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation +from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span + + +# 将每一个line中的span从左到右排序 +def line_sort_spans_by_left_to_right(lines): + line_objects = [] + for line in lines: + # 按照x0坐标排序 + line.sort(key=lambda span: span['bbox'][0]) + line_bbox = [ + 
min(span['bbox'][0] for span in line), # x0 + min(span['bbox'][1] for span in line), # y0 + max(span['bbox'][2] for span in line), # x1 + max(span['bbox'][3] for span in line), # y1 + ] + line_objects.append({ + "bbox": line_bbox, + "spans": line, + }) + return line_objects + + +def merge_spans_to_line(spans): + if len(spans) == 0: + return [] + else: + # 按照y0坐标排序 + spans.sort(key=lambda span: span['bbox'][1]) + + lines = [] + current_line = [spans[0]] + for span in spans[1:]: + # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" + # image和table类型,同上 + if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( + s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in + current_line): + # 则开始新行 + lines.append(current_line) + current_line = [span] + continue + + # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): + current_line.append(span) + else: + # 否则,开始新行 + lines.append(current_line) + current_line = [span] + + # 添加最后一行 + if current_line: + lines.append(current_line) + + return lines + + +def merge_spans_to_line_by_layout(spans, layout_bboxes): + lines = [] + new_spans = [] + dropped_spans = [] + for item in layout_bboxes: + layout_bbox = item['layout_bbox'] + # 遍历spans,将每个span放入对应的layout中 + layout_sapns = [] + for span in spans: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6: + layout_sapns.append(span) + # 如果layout_sapns不为空,则放入new_spans中 + if len(layout_sapns) > 0: + new_spans.append(layout_sapns) + # 从spans删除已经放入layout_sapns中的span + for layout_sapn in layout_sapns: + spans.remove(layout_sapn) + + if len(new_spans) > 0: + for layout_sapns in new_spans: + layout_lines = merge_spans_to_line(layout_sapns) + lines.extend(layout_lines) + + # 对line中的span进行排序 + lines = line_sort_spans_by_left_to_right(lines) + + for span in spans: + span['tag'] = DropTag.NOT_IN_LAYOUT + 
dropped_spans.append(span) + + return lines, dropped_spans + + +def merge_lines_to_block(lines): + # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox + blocks = [] + for line in lines: + blocks.append( + { + "bbox": line["bbox"], + "lines": [line], + } + ) + return blocks + + +def sort_blocks_by_layout(all_bboxes, layout_bboxes): + new_blocks = [] + sort_blocks = [] + for item in layout_bboxes: + layout_bbox = item['layout_bbox'] + + # 遍历blocks,将每个blocks放入对应的layout中 + layout_blocks = [] + for block in all_bboxes: + # 如果是footnote则跳过 + if block[7] == BlockType.Footnote: + continue + block_bbox = block[:4] + if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8: + layout_blocks.append(block) + + # 如果layout_blocks不为空,则放入new_blocks中 + if len(layout_blocks) > 0: + new_blocks.append(layout_blocks) + # 从all_bboxes删除已经放入layout_blocks中的block + for layout_block in layout_blocks: + all_bboxes.remove(layout_block) + + # 如果new_blocks不为空,则对new_blocks中每个block进行排序 + if len(new_blocks) > 0: + for bboxes_in_layout_block in new_blocks: + bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序 + sort_blocks.extend(bboxes_in_layout_block) + + # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序 + return sort_blocks + + +def fill_spans_in_blocks(blocks, spans, radio): + ''' + 将allspans中的span按位置关系,放入blocks中 + ''' + block_with_spans = [] + for block in blocks: + block_type = block[7] + block_bbox = block[0:4] + block_dict = { + 'type': block_type, + 'bbox': block_bbox, + } + block_spans = [] + for span in spans: + span_bbox = span['bbox'] + if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio: + block_spans.append(span) + + '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' + # displayed_list = [] + # text_inline_lines = [] + # modify_y_axis(block_spans, displayed_list, text_inline_lines) + + '''模型识别错误的行间公式, type类型转换成行内公式''' + # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) + + '''bbox去除粘连''' # 
去粘连会影响span的bbox,导致后续fill的时候出错 + # block_spans = remove_overlap_between_bbox_for_span(block_spans) + + block_dict['spans'] = block_spans + block_with_spans.append(block_dict) + + # 从spans删除已经放入block_spans中的span + if len(block_spans) > 0: + for span in block_spans: + spans.remove(span) + + return block_with_spans, spans + + +def fix_block_spans(block_with_spans, img_blocks, table_blocks): + ''' + 1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系 + 需要将caption和footnote的text_span放入相应img_block和table_block内的 + caption_block和footnote_block中 + 2、同时需要删除block中的spans字段 + ''' + fix_blocks = [] + for block in block_with_spans: + block_type = block['type'] + + if block_type == BlockType.Image: + block = fix_image_block(block, img_blocks) + elif block_type == BlockType.Table: + block = fix_table_block(block, table_blocks) + elif block_type in [BlockType.Text, BlockType.Title]: + block = fix_text_block(block) + elif block_type == BlockType.InterlineEquation: + block = fix_interline_block(block) + else: + continue + fix_blocks.append(block) + return fix_blocks + + +def fix_discarded_block(discarded_block_with_spans): + fix_discarded_blocks = [] + for block in discarded_block_with_spans: + block = fix_text_block(block) + fix_discarded_blocks.append(block) + return fix_discarded_blocks + + +def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): + block_spans = [] + # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中 + for span in spans: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6: + block_spans.append(span) + block_lines = merge_spans_to_line(block_spans) + # 对line中的span进行排序 + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block = { + 'bbox': block_bbox, + 'type': block_type, + 'lines': sort_block_lines + } + return block, block_spans + + +def make_body_block(span: dict, block_bbox: list, block_type: str): + # 创建body_block + body_line = { + 'bbox': block_bbox, + 'spans': [span], + } + 
body_block = { + 'bbox': block_bbox, + 'type': block_type, + 'lines': [body_line] + } + return body_block + + +def fix_image_block(block, img_blocks): + block['blocks'] = [] + # 遍历img_blocks,找到与当前block匹配的img_block + for img_block in img_blocks: + if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95): + + # 创建img_body_block + for span in block['spans']: + if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']: + # 创建img_body_block + img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody) + block['blocks'].append(img_body_block) + + # 从spans中移除img_body_block中已经放入的span + block['spans'].remove(span) + break + + # 根据list长度,判断img_block中是否有img_caption + if img_block['img_caption_bbox'] is not None: + img_caption_block, img_caption_spans = merge_spans_to_block( + block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption + ) + block['blocks'].append(img_caption_block) + + break + del block['spans'] + return block + + +def fix_table_block(block, table_blocks): + block['blocks'] = [] + # 遍历table_blocks,找到与当前block匹配的table_block + for table_block in table_blocks: + if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95): + + # 创建table_body_block + for span in block['spans']: + if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']: + # 创建table_body_block + table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody) + block['blocks'].append(table_body_block) + + # 从spans中移除img_body_block中已经放入的span + block['spans'].remove(span) + break + + # 根据list长度,判断table_block中是否有caption + if table_block['table_caption_bbox'] is not None: + table_caption_block, table_caption_spans = merge_spans_to_block( + block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption + ) + block['blocks'].append(table_caption_block) + + # 如果table_caption_block_spans不为空 + if len(table_caption_spans) > 0: + 
# 一些span已经放入了caption_block中,需要从block['spans']中删除 + for span in table_caption_spans: + block['spans'].remove(span) + + # 根据list长度,判断table_block中是否有table_note + if table_block['table_footnote_bbox'] is not None: + table_footnote_block, table_footnote_spans = merge_spans_to_block( + block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote + ) + block['blocks'].append(table_footnote_block) + + break + del block['spans'] + return block + + +def fix_text_block(block): + # 文本block中的公式span都应该转换成行内type + for span in block['spans']: + if span['type'] == ContentType.InterlineEquation: + span['type'] = ContentType.InlineEquation + block_lines = merge_spans_to_line(block['spans']) + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block['lines'] = sort_block_lines + del block['spans'] + return block + + +def fix_interline_block(block): + block_lines = merge_spans_to_line(block['spans']) + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block['lines'] = sort_block_lines + del block['spans'] + return block diff --git a/magic_pdf/pre_proc/ocr_span_list_modify.py b/magic_pdf/pre_proc/ocr_span_list_modify.py new file mode 100644 index 0000000000000000000000000000000000000000..9ed1ea2f82b025ed2d491a91babcdda297165dd1 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_span_list_modify.py @@ -0,0 +1,258 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \ + __is_overlaps_y_exceeds_threshold, calculate_iou +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import ContentType, BlockType + +def remove_overlaps_low_confidence_spans(spans): + dropped_spans = [] + # 删除重叠spans中置信度低的的那些 + for span1 in spans: + for span2 in spans: + if span1 != span2: + if calculate_iou(span1['bbox'], span2['bbox']) > 0.9: + if span1['score'] < span2['score']: + span_need_remove = span1 + else: + span_need_remove = span2 + if 
span_need_remove is not None and span_need_remove not in dropped_spans: + dropped_spans.append(span_need_remove) + + if len(dropped_spans) > 0: + for span_need_remove in dropped_spans: + spans.remove(span_need_remove) + span_need_remove['tag'] = DropTag.SPAN_OVERLAP + + return spans, dropped_spans + + +def remove_overlaps_min_spans(spans): + dropped_spans = [] + # 删除重叠spans中较小的那些 + for span1 in spans: + for span2 in spans: + if span1 != span2: + overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65) + if overlap_box is not None: + span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None) + if span_need_remove is not None and span_need_remove not in dropped_spans: + dropped_spans.append(span_need_remove) + + if len(dropped_spans) > 0: + for span_need_remove in dropped_spans: + spans.remove(span_need_remove) + span_need_remove['tag'] = DropTag.SPAN_OVERLAP + + return spans, dropped_spans + + +def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): + # 遍历spans, 判断是否在removed_span_block_bboxes中 + # 如果是, 则删除该span 否则, 保留该span + need_remove_spans = [] + for span in spans: + for removed_bbox in need_remove_spans_bboxes: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: + if span not in need_remove_spans: + need_remove_spans.append(span) + break + + if len(need_remove_spans) > 0: + for span in need_remove_spans: + spans.remove(span) + + return spans + + +def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): + dropped_spans = [] + for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items(): + # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}") + need_remove_spans = [] + for span in spans: + # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span + for removed_bbox in removed_bboxes: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: + need_remove_spans.append(span) + break + # 
当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span + elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \ + removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]: + need_remove_spans.append(span) + break + + for span in need_remove_spans: + spans.remove(span) + span['tag'] = drop_tag + dropped_spans.append(span) + + return spans, dropped_spans + + +def adjust_bbox_for_standalone_block(spans): + # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 + for sb_span in spans: + if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + for text_span in spans: + if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]: + # 判断span2的纵向高度是否被span所覆盖 + if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]: + # 判断span2是否在span左边 + if text_span['bbox'][0] < sb_span['bbox'][0]: + # 调整span的y0和span2的y0一致 + sb_span['bbox'][1] = text_span['bbox'][1] + return spans + + +def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): + # displayed_list = [] + # 如果spans为空,则不处理 + if len(spans) == 0: + pass + else: + spans.sort(key=lambda span: span['bbox'][1]) + + lines = [] + current_line = [spans[0]] + if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + displayed_list.append(spans[0]) + + line_first_y0 = spans[0]["bbox"][1] + line_first_y = spans[0]["bbox"][3] + # 用于给行间公式搜索 + # text_inline_lines = [] + for span in spans[1:]: + # if span.get("content","") == "78.": + # print("debug") + # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" + # image和table类型,同上 + if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( + s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in + current_line): + # 传入 + if span["type"] in 
[ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + displayed_list.append(span) + # 则开始新行 + lines.append(current_line) + if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]: + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + current_line = [span] + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + continue + + # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): + if span["type"] == "text": + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + current_line.append(span) + + else: + # 否则,开始新行 + lines.append(current_line) + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + current_line = [span] + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + + # 添加最后一行 + if current_line: + lines.append(current_line) + if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]: + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + for line in text_inline_lines: + # 按照x0坐标排序 + current_line = line[0] + current_line.sort(key=lambda span: span['bbox'][0]) + + # 调整每一个文字行内bbox统一 + for line in text_inline_lines: + current_line, (line_first_y0, line_first_y) = line + for span in current_line: + span["bbox"][1] = line_first_y0 + span["bbox"][3] = line_first_y + + # return spans, displayed_list, text_inline_lines + + +def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list): + # 错误行间公式转行内公式 + j = 0 + for i in range(len(displayed_list)): + # if i == 8: + # print("debug") + span = displayed_list[i] + span_y0, span_y = span["bbox"][1], span["bbox"][3] + + while j < len(text_inline_lines): + text_line = text_inline_lines[j] + y0, y1 = text_line[1] + if ( + span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1 + ) and 
__is_overlaps_y_exceeds_threshold( + span['bbox'], (0, y0, 0, y1) + ): + # 调整公式类型 + if span["type"] == ContentType.InterlineEquation: + # 最后一行是行间公式 + if j + 1 >= len(text_inline_lines): + span["type"] = ContentType.InlineEquation + span["bbox"][1] = y0 + span["bbox"][3] = y1 + else: + # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换 + y0_next, y1_next = text_inline_lines[j + 1][1] + if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * ( + y1 - y0) > span_y - span_y0: + span["type"] = ContentType.InlineEquation + span["bbox"][1] = y0 + span["bbox"][3] = y1 + break + elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'], + (0, y0, 0, y1)): + break + else: + j += 1 + + return spans + + +def get_qa_need_list(blocks): + # 创建 images, tables, interline_equations, inline_equations 的副本 + images = [] + tables = [] + interline_equations = [] + inline_equations = [] + + for block in blocks: + for line in block["lines"]: + for span in line["spans"]: + if span["type"] == ContentType.Image: + images.append(span) + elif span["type"] == ContentType.Table: + tables.append(span) + elif span["type"] == ContentType.InlineEquation: + inline_equations.append(span) + elif span["type"] == ContentType.InterlineEquation: + interline_equations.append(span) + else: + continue + return images, tables, interline_equations, inline_equations + + +def get_qa_need_list_v2(blocks): + # 创建 images, tables, interline_equations, inline_equations 的副本 + images = [] + tables = [] + interline_equations = [] + + for block in blocks: + if block["type"] == BlockType.Image: + images.append(block) + elif block["type"] == BlockType.Table: + tables.append(block) + elif block["type"] == BlockType.InterlineEquation: + interline_equations.append(block) + return images, tables, interline_equations diff --git a/magic_pdf/pre_proc/pdf_pre_filter.py b/magic_pdf/pre_proc/pdf_pre_filter.py new file mode 100644 index 
0000000000000000000000000000000000000000..1704d0efbd5dadd25ae916d3dc7c14d6f2c4e04a --- /dev/null +++ b/magic_pdf/pre_proc/pdf_pre_filter.py @@ -0,0 +1,74 @@ +from magic_pdf.libs.commons import fitz +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap +from magic_pdf.libs.drop_reason import DropReason + + +def __area(box): + return (box[2] - box[0]) * (box[3] - box[1]) + +def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool: + """ + 检查page是包含有颜色背景的矩形 + """ + color_bg_rect = [] + p_width, p_height = page.rect.width, page.rect.height + + # 先找到最大的带背景矩形 + blocks = page.get_cdrawings() + for block in blocks: + + if 'fill' in block and block['fill']: # 过滤掉透明的 + fill = list(block['fill']) + fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2]) + if fill==(1.0,1.0,1.0): + continue + rect = block['rect'] + # 过滤掉特别小的矩形 + if __area(rect) < 10*10: + continue + # 为了防止是svg图片上的色块,这里过滤掉这类 + + if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]): + continue + color_bg_rect.append(rect) + + # 找到最大的背景矩形 + if len(color_bg_rect) > 0: + max_rect = max(color_bg_rect, key=lambda x:__area(x)) + max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3])) + # 判断最大的背景矩形是否包含超过3行文字,或者50个字 TODO + if max_rect[2]-max_rect[0] > 0.2*p_width and max_rect[3]-max_rect[1] > 0.1*p_height:#宽度符合 + #看是否有文本块落入到这个矩形中 + for text_block in text_blocks: + box = text_block['bbox'] + box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3])) + if _is_in(box_int, max_rect_int): + return True + + return False + + +def __is_table_overlap_text_block(text_blocks, table_bbox): + """ + 检查table_bbox是否覆盖了text_blocks里的文本块 + TODO + """ + for text_block in text_blocks: + box = text_block['bbox'] + if _is_in_or_part_overlap(table_bbox, box): + return True + return False + + +def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple: + """ + return:(True|False, err_msg) + True, 
如果pdf符合要求 + False, 如果pdf不符合要求 + + """ + if __is_contain_color_background_rect(page, text_blocks, image_bboxes): + return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX} + + + return True, None \ No newline at end of file diff --git a/magic_pdf/pre_proc/post_layout_split.py b/magic_pdf/pre_proc/post_layout_split.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/pre_proc/remove_bbox_overlap.py b/magic_pdf/pre_proc/remove_bbox_overlap.py new file mode 100644 index 0000000000000000000000000000000000000000..2afb07119afe91d412832b5a8766a513c6618560 --- /dev/null +++ b/magic_pdf/pre_proc/remove_bbox_overlap.py @@ -0,0 +1,98 @@ +from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap +from magic_pdf.libs.drop_reason import DropReason + +def _remove_overlap_between_bbox(bbox1, bbox2): + if _is_part_overlap(bbox1, bbox2): + ix0, iy0, ix1, iy1 = bbox1 + x0, y0, x1, y1 = bbox2 + + diff_x = min(x1, ix1) - max(x0, ix0) + diff_y = min(y1, iy1) - max(y0, iy0) + + if diff_y > diff_x: + if x1 >= ix1: + mid = (x0 + ix1) // 2 + ix1 = min(mid - 0.25, ix1) + x0 = max(mid + 0.25, x0) + else: + mid = (ix0 + x1) // 2 + ix0 = max(mid + 0.25, ix0) + x1 = min(mid - 0.25, x1) + else: + if y1 >= iy1: + mid = (y0 + iy1) // 2 + y0 = max(mid + 0.25, y0) + iy1 = min(iy1, mid-0.25) + else: + mid = (iy0 + y1) // 2 + y1 = min(y1, mid-0.25) + iy0 = max(mid + 0.25, iy0) + + if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0: + bbox1 = [ix0, iy0, ix1, iy1] + bbox2 = [x0, y0, x1, y1] + return bbox1, bbox2, None + else: + return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA + else: + return bbox1, bbox2, None + + +def _remove_overlap_between_bboxes(arr): + drop_reasons = [] + N = len(arr) + keeps = [True] * N + res = [None] * N + for i in range(N): + for j in range(N): + if i == j: + continue + if _is_in(arr[i]["bbox"], arr[j]["bbox"]): + keeps[i] = False + + for 
idx, v in enumerate(arr): + if not keeps[idx]: + continue + for i in range(N): + if res[i] is None: + continue + + bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"]) + if drop_reason is None: + v["bbox"] = bbox1 + res[i]["bbox"] = bbox2 + else: + if v["score"] > res[i]["score"]: + keeps[i] = False + res[i] = None + else: + keeps[idx] = False + drop_reasons.append(drop_reasons) + if keeps[idx]: + res[idx] = v + return res, drop_reasons + + +def remove_overlap_between_bbox_for_span(spans): + arr = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ] + res, drop_reasons = _remove_overlap_between_bboxes(arr) + ret = [] + for i in range(len(res)): + if res[i] is None: + continue + spans[i]["bbox"] = res[i]["bbox"] + ret.append(spans[i]) + return ret, drop_reasons + + +def remove_overlap_between_bbox_for_block(all_bboxes): + arr = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ] + res, drop_reasons = _remove_overlap_between_bboxes(arr) + ret = [] + for i in range(len(res)): + if res[i] is None: + continue + all_bboxes[i][:4] = res[i]["bbox"] + ret.append(all_bboxes[i]) + return ret, drop_reasons + diff --git a/magic_pdf/pre_proc/remove_colored_strip_bbox.py b/magic_pdf/pre_proc/remove_colored_strip_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..17be73aa5010a4f179c2645d972d02afc7ab482b --- /dev/null +++ b/magic_pdf/pre_proc/remove_colored_strip_bbox.py @@ -0,0 +1,79 @@ +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio +from loguru import logger + +from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK + + +def __area(box): + return (box[2] - box[0]) * (box[3] - box[1]) + + +def rectangle_position_determination(rect, p_width): + """ + 判断矩形是否在页面中轴线附近。 + + Args: + rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。 + p_width (int): 页面宽度。 + + Returns: + bool: 若矩形在页面中轴线附近则返回True,否则返回False。 + """ + # 页面中轴线x坐标 + x_axis = 
p_width / 2 + # 矩形是否跨越中轴线 + is_span = rect[0] < x_axis and rect[2] > x_axis + if is_span: + return True + else: + # 矩形与中轴线的距离,只算近的那一边 + distance = rect[0] - x_axis if rect[0] > x_axis else x_axis - rect[2] + # 判断矩形与中轴线的距离是否小于页面宽度的20% + if distance < p_width * 0.2: + return True + else: + return False + +def remove_colored_strip_textblock(remain_text_blocks, page): + """ + 根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_strip_textblock。 + + Args: + remain_text_blocks (list): 剩余文本块列表。 + page (Page): 页面对象。 + + Returns: + tuple: 剩余文本块列表和移除的文本块列表。 + """ + colored_strip_textblocks = [] # 先构造一个空的返回 + if len(remain_text_blocks) > 0: + p_width, p_height = page.rect.width, page.rect.height + blocks = page.get_cdrawings() + colored_strip_bg_rect = [] + for block in blocks: + is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0) # 过滤掉透明的 + rect = block['rect'] + area_is_large_enough = __area(rect) > 100 # 过滤掉特别小的矩形 + rectangle_position_determination_result = rectangle_position_determination(rect, p_width) + in_upper_half_page = rect[3] < p_height * 0.3 # 找到位于页面上半部分的矩形,下边界小于页面高度的30% + aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4 # 找到长宽比超过4的矩形 + + if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4: + colored_strip_bg_rect.append(rect) + + if len(colored_strip_bg_rect) > 0: + for colored_strip_block_bbox in colored_strip_bg_rect: + for text_block in remain_text_blocks: + text_bbox = text_block['bbox'] + if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6): + logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}') + text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK + colored_strip_textblocks.append(text_block) + + if len(colored_strip_textblocks) > 0: + 
for colored_strip_textblock in colored_strip_textblocks: + if colored_strip_textblock in remain_text_blocks: + remain_text_blocks.remove(colored_strip_textblock) + + return remain_text_blocks, colored_strip_textblocks + diff --git a/magic_pdf/pre_proc/remove_footer_header.py b/magic_pdf/pre_proc/remove_footer_header.py new file mode 100644 index 0000000000000000000000000000000000000000..9e04817c3ea2414c21f3631c6deaa8ad4fc7ff9e --- /dev/null +++ b/magic_pdf/pre_proc/remove_footer_header.py @@ -0,0 +1,117 @@ +import re + +from magic_pdf.libs.boxbase import _is_in_or_part_overlap +from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO + + +def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, + page_no_bboxs, page_w, page_h): + """ + 删除页眉页脚,页码 + 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中 + """ + header = [] + footer = [] + if len(header) == 0: + model_header = header_bboxs + if model_header: + x0 = min([x for x, _, _, _ in model_header]) + y0 = min([y for _, y, _, _ in model_header]) + x1 = max([x1 for _, _, x1, _ in model_header]) + y1 = max([y1 for _, _, _, y1 in model_header]) + header = [x0, y0, x1, y1] + if len(footer) == 0: + model_footer = footer_bboxs + if model_footer: + x0 = min([x for x, _, _, _ in model_footer]) + y0 = min([y for _, y, _, _ in model_footer]) + x1 = max([x1 for _, _, x1, _ in model_footer]) + y1 = max([y1 for _, _, _, y1 in model_footer]) + footer = [x0, y0, x1, y1] + + header_y0 = 0 if len(header) == 0 else header[3] + footer_y0 = page_h if len(footer) == 0 else footer[1] + if page_no_bboxs: + top_part = [b for b in page_no_bboxs if b[3] < page_h / 2] + btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2] + + top_max_y0 = max([b[1] for b in top_part]) if top_part else 0 + btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h + + header_y0 = max(header_y0, top_max_y0) + footer_y0 = min(footer_y0, btn_min_y1) + + content_boundry = [0, 
header_y0, page_w, footer_y0] + + header = [0, 0, page_w, header_y0] + footer = [0, footer_y0, page_w, page_h] + + """以上计算出来了页眉页脚的边界,下面开始进行删除""" + text_block_to_remove = [] + # 首先检查每个textblock + for blk in text_raw_blocks: + if len(blk['lines']) > 0: + for line in blk['lines']: + line_del = [] + for span in line['spans']: + span_del = [] + if span['bbox'][3] < header_y0: + span_del.append(span) + elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer): + span_del.append(span) + for span in span_del: + line['spans'].remove(span) + if not line['spans']: + line_del.append(line) + + for line in line_del: + blk['lines'].remove(line) + else: + # if not blk['lines']: + blk['tag'] = CONTENT_IN_FOOT_OR_HEADER + text_block_to_remove.append(blk) + + """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除""" + page_no_block_2_remove = [] + if page_no_bboxs: + for pagenobox in page_no_bboxs: + for block in text_raw_blocks: + if _is_in_or_part_overlap(pagenobox, block['bbox']): # 在span级别删除页码 + for line in block['lines']: + for span in line['spans']: + if _is_in_or_part_overlap(pagenobox, span['bbox']): + # span['text'] = '' + span['tag'] = PAGE_NO + # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除 + if len(line['spans']) == 1 and len(block['lines']) == 1: + page_no_block_2_remove.append(block) + else: + # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字 + if len(text_raw_blocks) > 0: + text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True) + last_block = text_raw_blocks[0] + if len(last_block['lines']) == 1: + last_line = last_block['lines'][0] + if len(last_line['spans']) == 1: + last_span = last_line['spans'][0] + if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', + last_span[ + 'text']): + last_span['tag'] = PAGE_NO + page_no_block_2_remove.append(last_block) + + for b in page_no_block_2_remove: + text_block_to_remove.append(b) + + for blk in 
def detect_non_horizontal_texts(result_dict):
    """
    Detect watermarks and vertical margin notes in the document.

    Blocks whose text direction is diagonal (5°–85°) and which repeat with an
    identical (bbox, text) pair on more than half of the pages are flagged as
    watermarks; blocks whose direction is near-vertical (85°–105°) and repeat
    the same way are flagged as vertical margin notes.  Headers/footers are
    excluded by the repetition requirement, since they can change per page.

    Parameters
    ----------
    result_dict : dict
        Mapping "page_N" -> {"block_M": {...}, ...}; each block may carry a
        "dir" direction vector plus "bbox" and "text".
        NOTE(review): bbox is used as part of a dict key, so it is assumed to
        be hashable (a tuple) — confirm against the producer of this dict.

    Returns
    -------
    dict
        The same dictionary, with "is_watermark" and
        "is_vertical_margin_note" (0/1) added to every block entry.
    """
    potential_watermarks = {}    # (bbox, text) -> occurrence count
    potential_margin_notes = {}  # (bbox, text) -> occurrence count

    for page_id, page_content in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in page_content.items():
            if not (block_id.startswith("block_") and "dir" in block_data):
                continue
            coordinates_text = (block_data["bbox"], block_data["text"])

            angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

            if 5 < angle < 85:  # diagonal text -> watermark candidate
                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
            if 85 < angle < 105:  # near-vertical text -> margin-note candidate
                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

    # Candidates must repeat on more than half of the pages.
    threshold = len(result_dict) // 2
    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in blocks.items():
            # Bug fix: the original labelled *every* entry of the page dict,
            # crashing with KeyError/TypeError on non-block entries (page
            # metadata).  Filter exactly like the counting loop above.
            if not block_id.startswith("block_"):
                continue
            coordinates_text = (block_data["bbox"], block_data["text"])
            block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
            block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

    return result_dict
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Return True when any two text blocks overlap horizontally.

    If that happens the PDF is abandoned, because such overlaps usually mean
    an equation was not detected.  Only blocks lying between the header band
    and the footer band take part in the check.
    """
    if not text_blocks:
        return False

    top_of_page = 0
    bottom_of_page = max(block['bbox'][3] for block in text_blocks)

    # Clip boundaries: bottom edge of the header area / top edge of the
    # footer area, defaulting to the page extremes when the list is empty.
    clip_top = max((item[1] for item in header), default=top_of_page)
    clip_bottom = min((item[3] for item in footer), default=bottom_of_page)

    candidate_bboxes = [
        block["bbox"]
        for block in text_blocks
        if clip_top <= block["bbox"][1] and block["bbox"][3] <= clip_bottom
    ]

    total = len(candidate_bboxes)
    for i in range(total):
        for j in range(i + 1, total):
            first, second = candidate_bboxes[i], candidate_bboxes[j]
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """
    Fix overly large gaps between inline text fragments.

    When two consecutive lines of a preprocessed block share the same
    (int-truncated) vertical extent they were split from one visual line, so
    a single space is prepended to the later fragment's first span to restore
    word separation.

    Parameters
    ----------
    pdf_info_dict : dict
        Mapping "page_N" -> page dict containing "preproc_blocks".

    Returns
    -------
    dict
        The same dictionary, modified in place.
    """
    # Iterate the page entries directly instead of assuming contiguous
    # "page_0".."page_{n-1}" keys — the original `pdf_info_dict[f'page_{i}']`
    # indexing raised KeyError for dicts with extra keys or numbering gaps.
    for page_key, page in pdf_info_dict.items():
        if not page_key.startswith('page_'):
            continue

        for block in page['preproc_blocks']:
            prev_y0, prev_y1 = 0, 0

            for line in block['lines']:
                _, y0, _, y1 = line['bbox']

                # Same truncated vertical extent as the previous line means
                # the two fragments belong to one visual line.
                if int(y0) == int(prev_y0) and int(y1) == int(prev_y1):
                    # Guard: the original indexed spans[0] unconditionally
                    # and could raise IndexError on a line with no spans.
                    if line['spans']:
                        line['spans'][0]['text'] = ' ' + line['spans'][0]['text']

                _, prev_y0, _, prev_y1 = line['bbox']

    return pdf_info_dict
0000000000000000000000000000000000000000..5bf7c78fe4fb02c0036f25c463720361f905dffd --- /dev/null +++ b/magic_pdf/pre_proc/statistics.py @@ -0,0 +1,12 @@ + +""" +统计处需要跨页、全局性的数据 +- 统计出字号从大到小 +- 正文区域占比最高的前5 +- 正文平均行间距 +- 正文平均字间距 +- 正文平均字符宽度 +- 正文平均字符高度 + +""" + diff --git a/magic_pdf/resources/model_config/UniMERNet/demo.yaml b/magic_pdf/resources/model_config/UniMERNet/demo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0db3064997623fe5501411ca5de20cccde6a405 --- /dev/null +++ b/magic_pdf/resources/model_config/UniMERNet/demo.yaml @@ -0,0 +1,46 @@ +model: + arch: unimernet + model_type: unimernet + model_config: + model_name: ./models + max_seq_len: 1024 + length_aware: False + load_pretrained: True + pretrained: ./models/pytorch_model.bin + tokenizer_config: + path: ./models + +datasets: + formula_rec_eval: + vis_processor: + eval: + name: "formula_image_eval" + image_size: + - 192 + - 672 + +run: + runner: runner_iter + task: unimernet_train + + batch_size_train: 64 + batch_size_eval: 64 + num_workers: 1 + + iters_per_inner_epoch: 2000 + max_iters: 60000 + + seed: 42 + output_dir: "../output/demo" + + evaluate: True + test_splits: [ "eval" ] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True + distributed_type: ddp # or fsdp when train llm + + generate_cfg: + temperature: 0.0 diff --git a/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f47cb9cde0c59d38ea1a736c4c6d7ad9efddaa3 --- /dev/null +++ b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml @@ -0,0 +1,351 @@ +AUG: + DETR: true +CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface +CUDNN_BENCHMARK: false +DATALOADER: + ASPECT_RATIO_GROUPING: true + FILTER_EMPTY_ANNOTATIONS: false + NUM_WORKERS: 4 + REPEAT_THRESHOLD: 0.0 + SAMPLER_TRAIN: TrainingSampler 
+DATASETS: + PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 + PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 + PROPOSAL_FILES_TEST: [] + PROPOSAL_FILES_TRAIN: [] + TEST: + - scihub_train + TRAIN: + - scihub_train +GLOBAL: + HACK: 1.0 +ICDAR_DATA_DIR_TEST: '' +ICDAR_DATA_DIR_TRAIN: '' +INPUT: + CROP: + ENABLED: true + SIZE: + - 384 + - 600 + TYPE: absolute_range + FORMAT: RGB + MASK_FORMAT: polygon + MAX_SIZE_TEST: 1333 + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MIN_SIZE_TRAIN: + - 480 + - 512 + - 544 + - 576 + - 608 + - 640 + - 672 + - 704 + - 736 + - 768 + - 800 + MIN_SIZE_TRAIN_SAMPLING: choice + RANDOM_FLIP: horizontal +MODEL: + ANCHOR_GENERATOR: + ANGLES: + - - -90 + - 0 + - 90 + ASPECT_RATIOS: + - - 0.5 + - 1.0 + - 2.0 + NAME: DefaultAnchorGenerator + OFFSET: 0.0 + SIZES: + - - 32 + - - 64 + - - 128 + - - 256 + - - 512 + BACKBONE: + FREEZE_AT: 2 + NAME: build_vit_fpn_backbone + CONFIG_PATH: '' + DEVICE: cuda + FPN: + FUSE_TYPE: sum + IN_FEATURES: + - layer3 + - layer5 + - layer7 + - layer11 + NORM: '' + OUT_CHANNELS: 256 + IMAGE_ONLY: true + KEYPOINT_ON: false + LOAD_PROPOSALS: false + MASK_ON: true + META_ARCHITECTURE: VLGeneralizedRCNN + PANOPTIC_FPN: + COMBINE: + ENABLED: true + INSTANCES_CONFIDENCE_THRESH: 0.5 + OVERLAP_THRESH: 0.5 + STUFF_AREA_LIMIT: 4096 + INSTANCE_LOSS_WEIGHT: 1.0 + PIXEL_MEAN: + - 127.5 + - 127.5 + - 127.5 + PIXEL_STD: + - 127.5 + - 127.5 + - 127.5 + PROPOSAL_GENERATOR: + MIN_SIZE: 0 + NAME: RPN + RESNETS: + DEFORM_MODULATED: false + DEFORM_NUM_GROUPS: 1 + DEFORM_ON_PER_STAGE: + - false + - false + - false + - false + DEPTH: 50 + NORM: FrozenBN + NUM_GROUPS: 1 + OUT_FEATURES: + - res4 + RES2_OUT_CHANNELS: 256 + RES5_DILATION: 1 + STEM_OUT_CHANNELS: 64 + STRIDE_IN_1X1: true + WIDTH_PER_GROUP: 64 + RETINANET: + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_WEIGHTS: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + FOCAL_LOSS_ALPHA: 0.25 + FOCAL_LOSS_GAMMA: 2.0 + IN_FEATURES: + - p3 + - p4 + - p5 + - p6 + - p7 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.4 + - 
0.5 + NMS_THRESH_TEST: 0.5 + NORM: '' + NUM_CLASSES: 10 + NUM_CONVS: 4 + PRIOR_PROB: 0.01 + SCORE_THRESH_TEST: 0.05 + SMOOTH_L1_LOSS_BETA: 0.1 + TOPK_CANDIDATES_TEST: 1000 + ROI_BOX_CASCADE_HEAD: + BBOX_REG_WEIGHTS: + - - 10.0 + - 10.0 + - 5.0 + - 5.0 + - - 20.0 + - 20.0 + - 10.0 + - 10.0 + - - 30.0 + - 30.0 + - 15.0 + - 15.0 + IOUS: + - 0.5 + - 0.6 + - 0.7 + ROI_BOX_HEAD: + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: + - 10.0 + - 10.0 + - 5.0 + - 5.0 + CLS_AGNOSTIC_BBOX_REG: true + CONV_DIM: 256 + FC_DIM: 1024 + NAME: FastRCNNConvFCHead + NORM: '' + NUM_CONV: 0 + NUM_FC: 2 + POOLER_RESOLUTION: 7 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + SMOOTH_L1_BETA: 0.0 + TRAIN_ON_PRED_BOXES: false + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 512 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + IOU_LABELS: + - 0 + - 1 + IOU_THRESHOLDS: + - 0.5 + NAME: CascadeROIHeads + NMS_THRESH_TEST: 0.5 + NUM_CLASSES: 10 + POSITIVE_FRACTION: 0.25 + PROPOSAL_APPEND_GT: true + SCORE_THRESH_TEST: 0.05 + ROI_KEYPOINT_HEAD: + CONV_DIMS: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + LOSS_WEIGHT: 1.0 + MIN_KEYPOINTS_PER_IMAGE: 1 + NAME: KRCNNConvDeconvUpsampleHead + NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true + NUM_KEYPOINTS: 17 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + ROI_MASK_HEAD: + CLS_AGNOSTIC_MASK: false + CONV_DIM: 256 + NAME: MaskRCNNConvUpsampleHead + NORM: '' + NUM_CONV: 4 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + RPN: + BATCH_SIZE_PER_IMAGE: 256 + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + BOUNDARY_THRESH: -1 + CONV_DIMS: + - -1 + HEAD_NAME: StandardRPNHead + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + - p6 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.3 + - 0.7 + LOSS_WEIGHT: 1.0 + NMS_THRESH: 0.7 + POSITIVE_FRACTION: 0.5 + POST_NMS_TOPK_TEST: 1000 + POST_NMS_TOPK_TRAIN: 2000 + 
PRE_NMS_TOPK_TEST: 1000 + PRE_NMS_TOPK_TRAIN: 2000 + SMOOTH_L1_BETA: 0.0 + SEM_SEG_HEAD: + COMMON_STRIDE: 4 + CONVS_DIM: 128 + IGNORE_VALUE: 255 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + LOSS_WEIGHT: 1.0 + NAME: SemSegFPNHead + NORM: GN + NUM_CLASSES: 10 + VIT: + DROP_PATH: 0.1 + IMG_SIZE: + - 224 + - 224 + NAME: layoutlmv3_base + OUT_FEATURES: + - layer3 + - layer5 + - layer7 + - layer11 + POS_TYPE: abs + WEIGHTS: +OUTPUT_DIR: +SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train +SEED: 42 +SOLVER: + AMP: + ENABLED: true + BACKBONE_MULTIPLIER: 1.0 + BASE_LR: 0.0002 + BIAS_LR_FACTOR: 1.0 + CHECKPOINT_PERIOD: 2000 + CLIP_GRADIENTS: + CLIP_TYPE: full_model + CLIP_VALUE: 1.0 + ENABLED: true + NORM_TYPE: 2.0 + GAMMA: 0.1 + GRADIENT_ACCUMULATION_STEPS: 1 + IMS_PER_BATCH: 32 + LR_SCHEDULER_NAME: WarmupCosineLR + MAX_ITER: 20000 + MOMENTUM: 0.9 + NESTEROV: false + OPTIMIZER: ADAMW + REFERENCE_WORLD_SIZE: 0 + STEPS: + - 10000 + WARMUP_FACTOR: 0.01 + WARMUP_ITERS: 333 + WARMUP_METHOD: linear + WEIGHT_DECAY: 0.05 + WEIGHT_DECAY_BIAS: null + WEIGHT_DECAY_NORM: 0.0 +TEST: + AUG: + ENABLED: false + FLIP: true + MAX_SIZE: 4000 + MIN_SIZES: + - 400 + - 500 + - 600 + - 700 + - 800 + - 900 + - 1000 + - 1100 + - 1200 + DETECTIONS_PER_IMAGE: 100 + EVAL_PERIOD: 1000 + EXPECTED_RESULTS: [] + KEYPOINT_OKS_SIGMAS: [] + PRECISE_BN: + ENABLED: false + NUM_ITER: 200 +VERSION: 2 +VIS_PERIOD: 0 diff --git a/magic_pdf/resources/model_config/model_configs.yaml b/magic_pdf/resources/model_config/model_configs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44cc8889178ce9a624c2ec92921640a553295ef1 --- /dev/null +++ b/magic_pdf/resources/model_config/model_configs.yaml @@ -0,0 +1,9 @@ +config: + device: cpu + layout: True + formula: True + +weights: + layout: Layout/model_final.pth + mfd: MFD/weights.pt + mfr: MFR/UniMERNet diff --git a/magic_pdf/rw/AbsReaderWriter.py b/magic_pdf/rw/AbsReaderWriter.py new file mode 100644 index 
class AbsReaderWriter(ABC):
    """Abstract reader/writer supporting both binary and text payloads."""

    # Mode constants shared by all concrete implementations.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    def __init__(self, parent_path):
        # Base directory for local implementations; destination prefix for
        # S3-style implementations.  Relative paths are resolved against it.
        self.parent_path = parent_path

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read *path*; absolute paths (local or s3) are used as-is, relative ones are joined with parent_path."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write *content* to *path*; absolute paths are used as-is, relative ones are joined with parent_path."""
        raise NotImplementedError

    @abstractmethod
    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
        """Read a byte range of a JSONL file; path resolution follows read()/write()."""
        raise NotImplementedError
MODE_TXT = "text"
MODE_BIN = "binary"


class DiskReaderWriter(AbsReaderWriter):
    """Local-filesystem implementation of AbsReaderWriter."""

    def __init__(self, parent_path, encoding="utf-8"):
        # Fix: also initialise the base class so AbsReaderWriter.parent_path
        # is populated; keep self.path for existing callers of this class.
        super().__init__(parent_path)
        self.path = parent_path
        self.encoding = encoding

    def _abs_path(self, path):
        """Return *path* unchanged if absolute, else joined with the parent path."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=MODE_TXT):
        """
        Read a file as text (MODE_TXT) or bytes (MODE_BIN).

        Raises an Exception when the file does not exist and ValueError for
        an unknown mode.
        """
        abspath = self._abs_path(path)
        if not os.path.exists(abspath):
            logger.error(f"文件 {abspath} 不存在")
            raise Exception(f"文件 {abspath} 不存在")
        if mode == MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=MODE_TXT):
        """Write text or bytes to *path*, creating parent directories as needed."""
        abspath = self._abs_path(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs(directory_path, exist_ok=True)
        if mode == MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
        # NOTE(review): byte_start/byte_end are accepted for interface
        # compatibility but ignored, exactly as in the original — the whole
        # file is returned.  Confirm before relying on local range reads.
        return self.read(path)


# Usage example
if __name__ == "__main__":
    file_path = "io/test/example.txt"
    # Fix: raw string — "\p" in the original was an invalid escape sequence.
    drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")

    # Write content to a file
    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")

    # Read content back
    content = drw.read(path=file_path)
    if content:
        logger.info(f"从 {file_path} 读取的内容: {content}")
class S3ReaderWriter(AbsReaderWriter):
    """S3-backed implementation of AbsReaderWriter."""

    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
        # Fix: initialise the base class so parent_path is populated there too.
        super().__init__(parent_path)
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard retries and the given addressing style."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def _full_s3_path(self, path: str) -> str:
        """Paths already starting with s3:// are used as-is; others join self.path."""
        return path if path.startswith("s3://") else join_path(self.path, path)

    def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Fetch an object; decode to text in MODE_TXT, return raw bytes in MODE_BIN."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)
        if mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Upload *content*; text is encoded with *encoding*, bytes go as-is."""
        s3_path = self._full_s3_path(s3_relative_path)
        if mode == MODE_TXT:
            body = content.encode(encoding)
        elif mode == MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
        """Range-read an object (bytes byte_start..byte_end, inclusive)."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(path))
        # Fix: compare against None — the original `if byte_end` wrongly
        # treated byte_end=0 as "read to the end".
        range_header = f'bytes={byte_start}-{byte_end}' if byte_end is not None else f'bytes={byte_start}-'
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        body = res["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)
        if mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")


if __name__ == "__main__":
    # Config the connection info
    ak = ""
    sk = ""
    endpoint_url = ""
    addressing_style = "auto"
    bucket_name = ""
    # Create an S3ReaderWriter object
    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")

    # Write text data to S3
    # Fix: write() takes `content`, not `data=` — the original raised TypeError here.
    text_data = "This is some text data"
    s3_reader_writer.write(text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)

    # Read text data from S3
    text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
    logger.info(f"Read text data from S3: {text_data_read}")

    # Write binary data to S3
    # Fix: the original uploaded text_data here by mistake.
    binary_data = b"This is some binary data"
    s3_reader_writer.write(binary_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)

    # Read binary data from S3
    binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")

    # Range Read text data from S3
    binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
                                                   byte_start=0, byte_end=10, mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")
def _first_present(jso: dict, primary: str, fallback: str):
    """Return jso[primary] unless it is missing/None, else jso[fallback]."""
    value = jso.get(primary)
    return value if value is not None else jso.get(fallback)


def get_data_source(jso: dict):
    """Data source name, falling back to the legacy 'file_source' key."""
    return _first_present(jso, "data_source", "file_source")


def get_data_type(jso: dict):
    """Data type, falling back to the legacy 'file_type' key."""
    return _first_present(jso, "data_type", "file_type")


def get_bookid(jso: dict):
    """Book id, falling back to the legacy 'original_file_id' key."""
    return _first_present(jso, "bookid", "original_file_id")


def exception_handler(jso: dict, e):
    """Log *e* and mark *jso* as dropped with the exception as the reason."""
    logger.exception(e)
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso


def get_bookname(jso: dict):
    """Book name built as '<data_source>/<file_id>'."""
    return f"{get_data_source(jso)}/{jso.get('file_id')}"


def spark_json_extractor(jso: dict) -> dict:
    """Extract the pdf type and layout-model output from a spark json record."""
    return {
        "_pdf_type": jso["_pdf_type"],
        "model_list": jso["doc_layout_result"],
    }
def convert_to_train_format(jso: dict) -> list:
    """
    Convert one document's preprocessed JSON into the layout-training format.

    For every "page_N" entry, emits a dict with "page_info" (page number and
    size), "bboxes" (category-labelled boxes) and "layout_tree".

    Category ids used here: 0 title, 1 figure, 2 plain text, 3 header,
    4 page number, 5 footnote, 6 footer, 7 table, 10 interline equation,
    13 inline equation.

    Fix: the original return annotation was ``-> []`` (a list literal
    evaluated at def time), corrected to the type ``list``.
    """
    pages = []
    for k, v in jso.items():
        if not k.startswith("page_"):
            continue
        page_idx = v["page_idx"]
        width, height = v["page_size"]

        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}

        bboxes: list[dict] = []
        for img_bbox in v["image_bboxes_with_caption"]:
            bbox = {"category_id": 1, "bbox": img_bbox["bbox"]}
            if "caption" in img_bbox:
                bbox["caption_bbox"] = img_bbox["caption"]
            bboxes.append(bbox)

        for tbl_bbox in v["table_bboxes_with_caption"]:
            bbox = {"category_id": 7, "bbox": tbl_bbox["bbox"]}
            if "caption" in tbl_bbox:
                bbox["caption_bbox"] = tbl_bbox["caption"]
            bboxes.append(bbox)

        for box in v["bak_page_no_bboxes"]:
            bboxes.append({"category_id": 4, "bbox": box})

        for box in v["bak_header_bboxes"]:
            bboxes.append({"category_id": 3, "bbox": box})

        for box in v["bak_footer_bboxes"]:
            bboxes.append({"category_id": 6, "bbox": box})

        # Paragraphs: titles are category 0, body text category 2.
        # 脚注, 目前没有看到例子 (footnotes: no example observed yet)
        for para in v["para_blocks"]:
            if "paras" in para:
                for para_content in para["paras"].values():
                    category = 0 if para_content["is_para_title"] else 2
                    bboxes.append({"category_id": category, "bbox": para_content["para_bbox"]})

        for inline_equation in v["inline_equations"]:
            bboxes.append({"category_id": 13, "bbox": inline_equation["bbox"]})

        for inter_equation in v["interline_equations"]:
            bboxes.append({"category_id": 10, "bbox": inter_equation["bbox"]})

        for footnote_bbox in v["bak_footer_note_bboxes"]:
            bboxes.append({"category_id": 5, "bbox": list(footnote_bbox)})

        info["bboxes"] = bboxes
        info["layout_tree"] = v["layout_bboxes"]
        pages.append(info)

    return pages
def extract_caption_bbox(outer: list, inner: list) -> list:
    """
    Pair each inner bbox with its caption region inside an enclosing outer bbox.

    For every inner box strictly contained in (and not coordinate-identical
    to) an outer box, the four strips of the outer box around the inner box
    are computed and the largest one is taken as the caption area.

    ret: list of {
        "bbox": [1, 2, 3, 4],
        "caption": [5, 6, 7, 8]   # present only when an enclosing outer box was found
    }
    """

    def _nearly_equal(a, b):
        # Non-strict float comparison: coordinates within 0.01 count as equal.
        return abs(a - b) < 0.01

    # Index the outer boxes so each can be consumed at most once.
    available_outer = {i: outer[i] for i in range(len(outer))}
    result = []
    for v in inner:
        ix0, iy0, ix1, iy1 = v
        entry = {"bbox": v[:4]}
        match_idx = None
        for idx in available_outer:
            ox0, oy0, ox1, oy1 = available_outer[idx]
            same_box = all([
                _nearly_equal(ix0, ox0),
                _nearly_equal(iy0, oy0),
                _nearly_equal(ix1, ox1),
                _nearly_equal(iy1, oy1),
            ])
            if _is_in(v, available_outer[idx]) and not same_box:
                match_idx = idx
                break
        if match_idx is not None:
            ox0, oy0, ox1, oy1 = available_outer[match_idx]
            # Four candidate strips: left, top, bottom and right of the inner box.
            strips = [
                [ox0, oy0, ix0, oy1],
                [ox0, oy0, ox1, iy0],
                [ox0, iy1, ox1, oy1],
                [ix1, oy0, ox1, oy1],
            ]
            strips.sort(key=lambda rect: abs(rect[0] - rect[2]) * abs(rect[1] - rect[3]))
            # The largest strip is the caption (面积最大的框就是caption).
            entry["caption"] = strips[-1]
            # One outer box determines the caption of at most one inner box.
            available_outer.pop(match_idx)
        result.append(entry)

    # Fix: removed the stray debug `print(outer, inner)` /
    # `print("found_count: ", ...)` calls left in the original.
    return result
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """
    Remove headers, footers and page numbers from one page (training-utils copy).

    NOTE(review): this copy is deliberately disabled — the original body began
    with ``if 1: return ...``, so the header/footer-removal logic below it was
    unreachable dead code.  This version keeps exactly that observable
    behaviour (pass every input through, drop nothing) and removes the
    unreachable duplicate; pre_proc/remove_footer_header.py remains the
    maintained implementation of the real logic.

    Returns
    -------
    tuple
        (image_bbox_remain, table_bbox_remain, text_block_remain,
         text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove) —
        the inputs unchanged followed by three empty removal lists.
    """
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
b/magic_pdf/train_utils/vis_utils.py @@ -0,0 +1,327 @@ +from magic_pdf.libs.commons import fitz +import os +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def draw_model_output( + raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str +): + """ + 在page上画出bbox,保存到save_path + """ + """ + + # {0: 'title', # 标题 + # 1: 'figure', # 图片 + # 2: 'plain text', # 文本 + # 3: 'header', # 页眉 + # 4: 'page number', # 页码 + # 5: 'footnote', # 脚注 + # 6: 'footer', # 页脚 + # 7: 'table', # 表格 + # 8: 'table caption', # 表格描述 + # 9: 'figure caption', # 图片描述 + # 10: 'equation', # 公式 + # 11: 'full column', # 单栏 + # 12: 'sub column', # 多栏 + # 13: 'embedding', # 嵌入公式 + # 14: 'isolated'} # 单行公式 + + """ + + color_map = { + "body": fitz.pdfcolor["green"], + "non_body": fitz.pdfcolor["red"], + } + """ + {"layout_dets": [], "subfield_dets": [], "page_info": {"page_no": 22, "height": 1650, "width": 1275}} + """ + for i, page in enumerate(raw_pdf_doc): + v = paras_dict_arr[i] + page_idx = v["page_info"]["page_no"] + width = v["page_info"]["width"] + height = v["page_info"]["height"] + + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( + paras_dict_arr[i], page + ) + + for order, block in enumerate(v["layout_dets"]): + L = block["poly"][0] / horizontal_scale_ratio + U = block["poly"][1] / vertical_scale_ratio + R = block["poly"][2] / horizontal_scale_ratio + D = block["poly"][5] / vertical_scale_ratio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + bbox = [L, U, R, D] + color = color_map["body"] + if block["category_id"] in (3, 4, 5, 6, 0): + color = color_map["non_body"] + + rect = fitz.Rect(bbox) + page.draw_rect(rect, fill=None, width=0.5, overlay=True, color=color) + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + raw_pdf_doc.save(save_path) + + +def debug_show_bbox( + raw_pdf_doc: 
fitz.Document, + page_idx: int, + bboxes: list, + droped_bboxes: list, + expect_drop_bboxes: list, + save_path: str, + expected_page_id: int, +): + """ + 以覆盖的方式写个临时的pdf,用于debug + """ + if page_idx != expected_page_id: + return + + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = raw_pdf_doc[page_idx].rect.width + height = raw_pdf_doc[page_idx].rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in droped_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2) + shape.finish() + shape.commit() + + for bbox in expect_drop_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=None) + shape.finish() + shape.commit() + + # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + doc.save(save_path) + doc.close() + + +def debug_show_page( + page, + bboxes1: list, + bboxes2: list, + bboxes3: list, +): + save_path = "./tmp/debug.pdf" + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = page.rect.width + height = page.rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes1: + # 原始box画上去 + rect = 
fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in bboxes2: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2) + shape.finish() + shape.commit() + + for bbox in bboxes3: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=None) + shape.finish() + shape.commit() + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + doc.save(save_path) + doc.close() + + +def draw_layout_bbox_on_page( + raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str +): + """ + 在page上画出bbox,保存到save_path + """ + # 检查文件是否存在 + is_new_pdf = False + if os.path.exists(pdf_path): + # 打开现有的 PDF 文件 + doc = fitz.open(pdf_path) + else: + # 创建一个新的空白 PDF 文件 + is_new_pdf = True + doc = fitz.open("") + + for k, v in paras_dict.items(): + page_idx = v["page_idx"] + layouts = v["layout_bboxes"] + page = doc[page_idx] + shape = page.new_shape() + for order, layout in enumerate(layouts): + border_offset = 1 + rect_box = layout["layout_bbox"] + layout_label = layout["layout_label"] + fill_color = fitz.pdfcolor["pink"] if layout_label == "U" else None + rect_box = [ + rect_box[0] + 1, + rect_box[1] - border_offset, + rect_box[2] - 1, + rect_box[3] + border_offset, + ] + rect = fitz.Rect(*rect_box) + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4) + """ + draw order text on layout box + """ + font_size = 10 + shape.insert_text( + (rect_box[0] + 1, rect_box[1] + font_size), + f"{order}", + fontsize=font_size, + color=(0, 0, 0), + ) + + """画上footer header""" + if header: + shape.draw_rect(fitz.Rect(header)) 
+ shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2) + if footer: + shape.draw_rect(fitz.Rect(footer)) + shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2) + + shape.commit() + + if is_new_pdf: + doc.save(pdf_path) + else: + doc.saveIncr() + doc.close() + + +@DeprecationWarning +def draw_layout_on_page( + raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str +): + """ + 把layout的box用红色边框花在pdf_path的page_idx上 + """ + + def draw(shape, layout, fill_color=fitz.pdfcolor["pink"]): + border_offset = 1 + rect_box = layout["layout_bbox"] + layout_label = layout["layout_label"] + sub_layout = layout["sub_layout"] + if len(sub_layout) == 0: + fill_color = fill_color if layout_label == "U" else None + rect_box = [ + rect_box[0] + 1, + rect_box[1] - border_offset, + rect_box[2] - 1, + rect_box[3] + border_offset, + ] + rect = fitz.Rect(*rect_box) + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.2) + # if layout_label=='U': + # bad_boxes = layout.get("bad_boxes", []) + # for bad_box in bad_boxes: + # rect = fitz.Rect(*bad_box) + # shape.draw_rect(rect) + # shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2) + # else: + # rect = fitz.Rect(*rect_box) + # shape.draw_rect(rect) + # shape.finish(color=fitz.pdfcolor['blue']) + + for sub_layout in sub_layout: + draw(shape, sub_layout) + shape.commit() + + # 检查文件是否存在 + is_new_pdf = False + if os.path.exists(pdf_path): + # 打开现有的 PDF 文件 + doc = fitz.open(pdf_path) + else: + # 创建一个新的空白 PDF 文件 + is_new_pdf = True + doc = fitz.open("") + + page = doc[page_idx] + shape = page.new_shape() + for order, layout in enumerate(page_layout): + draw(shape, layout, fitz.pdfcolor["yellow"]) + + # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = 
os.path.dirname(pdf_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + if is_new_pdf: + doc.save(pdf_path) + else: + doc.saveIncr() + doc.close() diff --git a/magic_pdf/user_api.py b/magic_pdf/user_api.py new file mode 100644 index 0000000000000000000000000000000000000000..bf36477294355a716400af177ce1a7f75a2e5806 --- /dev/null +++ b/magic_pdf/user_api.py @@ -0,0 +1,103 @@ +""" +用户输入: + model数组,每个元素代表一个页面 + pdf在s3的路径 + 截图保存的s3位置 + +然后: + 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader + 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter + +其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!! + +""" +import re + +from loguru import logger + +from magic_pdf.libs.version import __version__ +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw import AbsReaderWriter +from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr +from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt + +PARSE_TYPE_TXT = "txt" +PARSE_TYPE_OCR = "ocr" + + +def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, + **kwargs): + """ + 解析文本类pdf + """ + pdf_info_dict = parse_pdf_by_txt( + pdf_bytes, + pdf_models, + imageWriter, + start_page_id=start_page, + debug_mode=is_debug, + ) + + pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT + + pdf_info_dict["_version_name"] = __version__ + + return pdf_info_dict + + +def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, + **kwargs): + """ + 解析ocr类pdf + """ + pdf_info_dict = parse_pdf_by_ocr( + pdf_bytes, + pdf_models, + imageWriter, + start_page_id=start_page, + debug_mode=is_debug, + ) + + pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR + + pdf_info_dict["_version_name"] = __version__ + + return pdf_info_dict + + +def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, + input_model_is_empty: bool = 
False,
+                    *args, **kwargs):
+    """
+    ocr和文本混合的pdf,全部解析出来
+    """
+
+    def parse_pdf(method):
+        try:
+            return method(
+                pdf_bytes,
+                pdf_models,
+                imageWriter,
+                start_page_id=start_page,
+                debug_mode=is_debug,
+            )
+        except Exception as e:
+            logger.exception(e)
+            return None
+
+    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
+        logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+        if input_model_is_empty:
+            pdf_models = doc_analyze(pdf_bytes, ocr=True)
+        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
+        if pdf_info_dict is None:
+            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
+        else:
+            pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
+    else:
+        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
+
+    pdf_info_dict["_version_name"] = __version__
+
+    return pdf_info_dict
diff --git a/requirements-qa.txt b/requirements-qa.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d397f6cdfc083cf4f1d12c6f5865ea85fd7091df
--- /dev/null
+++ b/requirements-qa.txt
@@ -0,0 +1,16 @@
+pytest
+Levenshtein
+nltk
+rapidfuzz
+statistics
+openxlab #安装opendatalab
+pandas
+numpy
+matplotlib
+seaborn
+scipy
+scikit-learn
+tqdm
+htmltabletomd
+pypandoc
+pyopenssl==24.0.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b1665a8ec8bb80d739f0b1cc4c00e8af5a1f2f6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+boto3>=1.28.43
+Brotli>=1.1.0
+click>=8.1.7
+PyMuPDF>=1.24.7
+loguru>=0.6.0
+numpy>=1.21.6
+fast-langdetect>=0.2.1
+wordninja>=2.0.0
+scikit-learn>=1.0.2
+pdfminer.six>=20231228
+# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..4cfcde7495fb851f8e550ab074e89c8cb4a90bcd --- /dev/null +++ b/setup.py @@ -0,0 +1,51 @@ +from pathlib import Path +from setuptools import setup, find_packages +from magic_pdf.libs.version import __version__ + + +def parse_requirements(filename): + with open(filename) as f: + lines = f.read().splitlines() + + requires = [] + + for line in lines: + if "http" in line: + pkg_name_without_url = line.split('@')[0].strip() + requires.append(pkg_name_without_url) + else: + requires.append(line) + + return requires + + +if __name__ == '__main__': + with Path(Path(__file__).parent, + 'README.md').open(encoding='utf-8') as file: + long_description = file.read() + setup( + name="magic_pdf", # 项目名 + version=__version__, # 自动从tag中获取版本号 + packages=find_packages() + ["magic_pdf.resources"], # 包含所有的包 + package_data={ + "magic_pdf.resources": ["**"], # 包含magic_pdf.resources目录下的所有文件 + }, + install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库 + extras_require={ + "gpu": ["paddleocr==2.7.3", "paddlepaddle-gpu"], + "cpu": ["paddleocr==2.7.3", "paddlepaddle"], + "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle"], + }, + description="A practical tool for converting PDF to Markdown", # 简短描述 + long_description=long_description, # 详细描述 + long_description_content_type="text/markdown", # 如果README是Markdown格式 + url="https://github.com/opendatalab/MinerU", + python_requires=">=3.9", # 项目依赖的 Python 版本 + entry_points={ + "console_scripts": [ + "magic-pdf = magic_pdf.cli.magicpdf:cli" + ], + }, # 项目提供的可执行命令 + include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等 + zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False + ) diff --git a/update_version.py b/update_version.py new file mode 100644 index 0000000000000000000000000000000000000000..b51081625a051ded30a2f228fd67468177c59c5e --- /dev/null +++ b/update_version.py @@ -0,0 +1,27 @@ +import os +import 
subprocess
+
+
+def get_version():
+    command = ["git", "describe", "--tags"]
+    try:
+        version = subprocess.check_output(command).decode().strip()
+        version_parts = version.split("-")
+        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
+            return version_parts[1]
+        else:
+            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
+    except Exception as e:
+        print(e)
+        return "0.0.0"
+
+
+def write_version_to_commons(version):
+    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
+    with open(commons_path, 'w') as f:
+        f.write(f'__version__ = "{version}"\n')
+
+
+if __name__ == '__main__':
+    version_name = get_version()
+    write_version_to_commons(version_name)