diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..658f079091b24c3afa4dcd4c72d507bf986d4b08 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +demo/demo2.pdf filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000000000000000000000000000000..8bb192647da8edf8b7991467a40a2f52d64af052 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,85 @@ +name: Bug Report | 反馈 Bug +description: Create a bug report for MinerU | MinerU 的 Bug 反馈 +labels: bug + +# We omit `title: "..."` so that the field defaults to blank. If we set it to +# empty string, Github seems to reject this .yml file. + +body: + + - type: textarea + id: description + attributes: + label: Description of the bug | 错误描述 + description: | + A clear and concise description of the bug. | 简单描述遇到的问题 + + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: How to reproduce the bug | 如何复现 + + # Should not word-wrap this description here. + description: | + * Explain the steps required to reproduce the bug. | 说明复现此错误所需的步骤。 + * Include required code snippets, example files, etc. | 包含必要的代码片段、示例文件等。 + * Describe what you expected to happen (if not obvious). | 描述你期望发生的情况。 + * If applicable, add screenshots to help explain the problem. | 添加截图以帮助解释问题。 + * Include any other information that could be relevant, for example information about the Python environment. | 包括任何其他可能相关的信息。 + + For problems when building or installing MinerU: | 在构建或安装 MinerU 时遇到的问题: + * Give the **exact** build/install commands that were run. | 提供**确切**的构建/安装命令。 + * Give the **complete** output from these commands. 
| 提供这些命令的**完整**输出。 + + validations: + required: true + +# - type: markdown +# attributes: +# value: | +# # The information below is required. + + + - type: dropdown + id: os_name + attributes: + label: Operating system | 操作系统 + #multiple: true + options: + - + - Windows + - Linux + - MacOS + validations: + required: true + + - type: dropdown + id: python_version + attributes: + label: Python version | Python 版本 + #multiple: true + # Need quotes around `3.10` otherwise it is treated as a number and shows as `3.1`. + options: + - + - "3.12" + - "3.11" + - "3.10" + - "3.9" + validations: + required: true + + - type: dropdown + id: device_mode + attributes: + label: Device mode | 设备模式 + #multiple: true + options: + - + - cpu + - cuda + - mps + validations: + required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000000000000000000000000000000000..8eee8d403ed164e542db56d4c9ac879460aeb2dd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,28 @@ +--- +name: Feature request | 功能需求 +about: Suggest an idea for this project | 提出一个有价值的idea +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +**您的特性请求是否与某个问题相关?请描述。** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] +对存在的问题进行清晰且简洁的描述。例如:我一直很困扰的是 [...] + +**Describe the solution you'd like** +**描述您期望的解决方案** +A clear and concise description of what you want to happen. +清晰且简洁地描述您希望实现的内容。 + +**Describe alternatives you've considered** +**描述您已考虑的替代方案** +A clear and concise description of any alternative solutions or features you've considered. +清晰且简洁地描述您已经考虑过的任何替代解决方案。 + +**Additional context** +**提供更多细节** +Add any other context or screenshots about the feature request here. 
+请附上任何相关截图、链接或文件,以帮助我们更好地理解您的请求。 \ No newline at end of file diff --git a/.github/workflows/cla.yml b/.github/workflows/cla.yml new file mode 100644 index 0000000000000000000000000000000000000000..f37e6db08a6132d24a32b09d27143b00c7396acf --- /dev/null +++ b/.github/workflows/cla.yml @@ -0,0 +1,43 @@ +name: "CLA Assistant" +on: + issue_comment: + types: [created] + pull_request_target: + types: [opened,closed,synchronize] + +# explicitly configure permissions, in case your GITHUB_TOKEN workflow permissions are set to read-only in repository settings +permissions: + actions: write + contents: write # this can be 'read' if the signatures are in remote repository + pull-requests: write + statuses: write + +jobs: + CLAAssistant: + runs-on: ubuntu-latest + steps: + - name: "CLA Assistant" + if: (github.event.comment.body == 'recheck' || github.event.comment.body == 'I have read the CLA Document and I hereby sign the CLA') || github.event_name == 'pull_request_target' + uses: contributor-assistant/github-action@v2.4.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # the below token should have repo scope and must be manually added by you in the repository's secret + # This token is required only if you have configured to store the signatures in a remote repository/organization + PERSONAL_ACCESS_TOKEN: ${{ secrets.RELEASE_TOKEN }} + with: + path-to-signatures: 'signatures/version1/cla.json' + path-to-document: 'https://github.com/cla-assistant/github-action/blob/master/SAPCLA.md' # e.g. 
a CLA or a DCO document + # branch should not be protected + branch: 'main' + allowlist: user1,bot* + + # the followings are the optional inputs - If the optional inputs are not given, then default values will be taken + #remote-organization-name: enter the remote organization name where the signatures should be stored (Default is storing the signatures in the same repository) + #remote-repository-name: enter the remote repository name where the signatures should be stored (Default is storing the signatures in the same repository) + #create-file-commit-message: 'For example: Creating file for storing CLA Signatures' + #signed-commit-message: 'For example: $contributorName has signed the CLA in $owner/$repo#$pullRequestNo' + #custom-notsigned-prcomment: 'pull request comment with Introductory message to ask new contributors to sign' + #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' + #custom-allsigned-prcomment: 'pull request comment when all contributors has signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.' 
+ #lock-pullrequest-aftermerge: false - if you don't want this bot to automatically lock the pull request after merging (default - true) + #use-dco-flag: true - If you are using DCO instead of CLA diff --git a/.github/workflows/cli.yml b/.github/workflows/cli.yml new file mode 100644 index 0000000000000000000000000000000000000000..5e174cbcd6ae5769829d28a3613734287826d0d4 --- /dev/null +++ b/.github/workflows/cli.yml @@ -0,0 +1,46 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: mineru +on: + push: + branches: + - "master" + paths-ignore: + - "cmds/**" + - "**.md" + pull_request: + branches: + - "master" + paths-ignore: + - "cmds/**" + - "**.md" + workflow_dispatch: +jobs: + cli-test: + runs-on: ubuntu-latest + timeout-minutes: 40 + strategy: + fail-fast: true + + steps: + - name: PDF cli + uses: actions/checkout@v3 + with: + fetch-depth: 2 + + - name: check-requirements + run: | + pip install -r requirements.txt + pip install -r requirements-qa.txt + pip install magic-pdf + - name: test_cli + run: | + cp magic-pdf.template.json ~/magic-pdf.json + echo $GITHUB_WORKSPACE + cd $GITHUB_WORKSPACE && export PYTHONPATH=. 
&& pytest -s -v tests/test_unit.py + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_cli.py + + - name: benchmark + run: | + cd $GITHUB_WORKSPACE && pytest -s -v tests/test_cli/test_bench.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 0000000000000000000000000000000000000000..e1e8f99cb9f0682b1c93b80fba5f2f498b191538 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,126 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: Python package + +on: + push: + tags: + - '*released' + workflow_dispatch: + + +jobs: + + update-version: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: master + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Update version.py + run: | + python update_version.py + + - name: Verify version.py + run: | + ls -l magic_pdf/libs/version.py + cat magic_pdf/libs/version.py + + - name: Commit changes + run: | + git config --local user.email "moe@myhloli.com" + git config --local user.name "myhloli" + git add magic_pdf/libs/version.py + if git diff-index --quiet HEAD; then + echo "No changes to commit" + else + git commit -m "Update version.py with new version" + fi + id: commit_changes + + - name: Push changes + if: steps.commit_changes.outcome == 'success' + env: + GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} + run: | + git push origin HEAD:master + + build: + needs: [ update-version ] + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.10"] + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: master + fetch-depth: 0 + + - name: Verify version.py + run: | + ls -l magic_pdf/libs/version.py + 
cat magic_pdf/libs/version.py + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Install wheel + run: | + python -m pip install wheel + + - name: Build wheel + run: | + python setup.py bdist_wheel + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: wheel-file + path: dist/*.whl + retention-days: 30 + + release: + needs: [ build ] + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Download artifact + uses: actions/download-artifact@v4 + with: + name: wheel-file + path: dist + + - name: Create and Upload Release + id: create_release + uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981 + with: + files: './dist/*.whl' + env: + GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} + + - name: Publish distribution to PyPI + run: | + pip install twine + twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/rerun.yml b/.github/workflows/rerun.yml new file mode 100644 index 0000000000000000000000000000000000000000..b607019a959544c1b09f7b39c3e894725eb4e0a7 --- /dev/null +++ b/.github/workflows/rerun.yml @@ -0,0 +1,23 @@ +name: check-status + +on: + workflow_run: + workflows: [ci] + types: [completed] + +jobs: + on-failure: + runs-on: pdf + permissions: + actions: write + if: ${{ (github.event.workflow_run.head_branch == 'master') && github.event.workflow_run.conclusion == 'failure' && github.event.workflow_run.run_attempt < 3 }} + steps: + - run: | + echo 'The triggering workflow failed' + sleep 600 + curl -L \ + -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ github.token }}" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + https://api.github.com/repos/${{ 
github.repository }}/actions/runs/${{ github.event.workflow_run.id }}/rerun-failed-jobs diff --git a/.github/workflows/update_base.yml b/.github/workflows/update_base.yml new file mode 100644 index 0000000000000000000000000000000000000000..bce75a6c2dc0ce64e0cc38f3e8bbb26d907dc289 --- /dev/null +++ b/.github/workflows/update_base.yml @@ -0,0 +1,22 @@ +# This workflow will install Python dependencies, run tests and lint with a variety of Python versions +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: update-base +on: + push: + tags: + - '*released' + workflow_dispatch: +jobs: + pdf-test: + runs-on: pdf + timeout-minutes: 40 + + + steps: + - name: update-base + uses: actions/checkout@v3 + - name: start-update + run: | + echo "start test" + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6de528ed7be16aad90fbf58df5a32fc2850641db --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +*.tar +*.tar.gz +venv*/ +envs/ +slurm_logs/ + +sync1.sh +data_preprocess_pj1 +data-preparation1 +__pycache__ +*.log +*.pyc +.vscode +debug/ +*.ipynb +.idea + +# vscode history +.history + +.DS_Store +.env + +bad_words/ +bak/ + +app/tests/* +temp/ +tmp/ +tmp +.vscode +.vscode/ +/tests/ +ocr_demo + +/app/common/__init__.py +/magic_pdf/config/__init__.py diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..0ad25db4bd1d86c452db3f9602ccdbe172438f52 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. 
It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. 
Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. 
+ + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. 
Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. 
If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. 
+ + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the 
material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. 
+ + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. 
If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. 
+ + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published + by the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. 
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +. diff --git a/README.md b/README.md index 3152f252170d13a901054ac8caf073570320353a..0e883a3420128e650e9d0a77f3e618e4ef4a17f5 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,292 @@ --- title: MinerU -emoji: 📈 -colorFrom: red -colorTo: pink +app_file: ./demo/app.py sdk: gradio sdk_version: 4.39.0 -app_file: app.py -pinned: false --- +
+
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf) + + + + +[English](README.md) | [简体中文](README_zh-CN.md) + +
+ +
+ +
+ +# MinerU + + +## Introduction + +MinerU is a one-stop, open-source, high-quality data extraction tool, includes the following primary features: + +- [Magic-PDF](#Magic-PDF) PDF Document Extraction +- [Magic-Doc](#Magic-Doc) Webpage & E-book Extraction + + +# Magic-PDF + + +## Introduction + +Magic-PDF is a tool designed to convert PDF documents into Markdown format, capable of processing files stored locally or on object storage supporting S3 protocol. + +Key features include: + +- Support for multiple front-end model inputs +- Removal of headers, footers, footnotes, and page numbers +- Human-readable layout formatting +- Retains the original document's structure and formatting, including headings, paragraphs, lists, and more +- Extraction and display of images and tables within markdown +- Conversion of equations into LaTeX format +- Automatic detection and conversion of garbled PDFs +- Compatibility with CPU and GPU environments +- Available for Windows, Linux, and macOS platforms + + +https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070 + + + +## Project Panorama + +![Project Panorama](docs/images/project_panorama_en.png) + + +## Flowchart + +![Flowchart](docs/images/flowchart_en.png) + +### Dependency repositorys + +- [PDF-Extract-Kit : A Comprehensive Toolkit for High-Quality PDF Content Extraction](https://github.com/opendatalab/PDF-Extract-Kit) 🚀🚀🚀 + +## Getting Started + +### Requirements + +- Python >= 3.9 + +Using a virtual environment is recommended to avoid potential dependency conflicts; both venv and conda are suitable. +For example: +```bash +conda create -n MinerU python=3.10 +conda activate MinerU +``` + +### Installation and Configuration + +#### 1. Install Magic-PDF + +Install the full-feature package with pip: +>Note: The pip-installed package supports CPU-only and is ideal for quick tests. 
+> +>For CUDA/MPS acceleration in production, see [Acceleration Using CUDA or MPS](#4-Acceleration-Using-CUDA-or-MPS). + +```bash +pip install magic-pdf[full-cpu] +``` +The full-feature package depends on detectron2, which requires a compilation installation. +If you need to compile it yourself, please refer to https://github.com/facebookresearch/detectron2/issues/5114 +Alternatively, you can directly use our precompiled whl package (limited to Python 3.10): + +```bash +pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ +``` + + +#### 2. Downloading model weights files + +For detailed references, please see below [how_to_download_models](docs/how_to_download_models_en.md) + +After downloading the model weights, move the 'models' directory to a directory on a larger disk space, preferably an SSD. + + +#### 3. Copy the Configuration File and Make Configurations +You can get the [magic-pdf.template.json](magic-pdf.template.json) file in the repository root directory. +```bash +cp magic-pdf.template.json ~/magic-pdf.json +``` +In magic-pdf.json, configure "models-dir" to point to the directory where the model weights files are located. + +```json +{ + "models-dir": "/tmp/models" +} +``` + + +#### 4. Acceleration Using CUDA or MPS +If you have an available Nvidia GPU or are using a Mac with Apple Silicon, you can leverage acceleration with CUDA or MPS respectively. +##### CUDA + +You need to install the corresponding PyTorch version according to your CUDA version. +This example installs the CUDA 11.8 version.More information https://pytorch.org/get-started/locally/ +```bash +pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 +``` +Also, you need to modify the value of "device-mode" in the configuration file magic-pdf.json. +```json +{ + "device-mode":"cuda" +} +``` + +##### MPS + +For macOS users with M-series chip devices, you can use MPS for inference acceleration. 
+You also need to modify the value of "device-mode" in the configuration file magic-pdf.json.
+```json
+{
+  "device-mode":"mps"
+}
+```
+
+
+### Usage
+
+#### 1. Usage via Command Line
+
+###### simple
+
+```bash
+magic-pdf pdf-command --pdf "pdf_path" --inside_model true
+```
+After the program has finished, you can find the generated markdown files under the directory "/tmp/magic-pdf".
+You can find the corresponding xxx_model.json file in the markdown directory.
+If you intend to do secondary development on the post-processing pipeline, you can use the command:
+```bash
+magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path"
+```
+In this way, you won't need to re-run the model data, making debugging more convenient.
+
+
+###### more
+
+```bash
+magic-pdf --help
+```
+
+
+#### 2. Usage via API
+
+###### Local
+```python
+image_writer = DiskReaderWriter(local_image_dir)
+image_dir = str(os.path.basename(local_image_dir))
+jso_useful_key = {"_pdf_type": "", "model_list": []}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+
+###### Object Storage
+```python
+s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint)
+image_dir = "s3://img_bucket/"
+s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir)
+pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN)
+jso_useful_key = {"_pdf_type": "", "model_list": []}
+pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli)
+pipe.pipe_classify()
+pipe.pipe_parse()
+md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none")
+```
+
+Demo can be referred to [demo.py](demo/demo.py)
+
+
+# Magic-Doc
+
+
+## Introduction
+
+Magic-Doc is a tool designed to convert web pages or multi-format e-books into markdown format.
+
+Key Features Include:
+
+- Web Page Extraction
+  - Cross-modal precise parsing of text, images, tables, and formula information.
+ +- E-Book Document Extraction + - Supports various document formats including epub, mobi, with full adaptation for text and images. + +- Language Type Identification + - Accurate recognition of 176 languages. + +https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca + + + +https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d + + + +https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2 + + + + +## Project Repository + +- [Magic-Doc](https://github.com/InternLM/magic-doc) + Outstanding Webpage and E-book Extraction Tool + + +# All Thanks To Our Contributors + + + + + + +# License Information + +[LICENSE.md](LICENSE.md) + +The project currently leverages PyMuPDF to deliver advanced functionalities; however, its adherence to the AGPL license may impose limitations on certain use cases. In upcoming iterations, we intend to explore and transition to a more permissively licensed PDF processing library to enhance user-friendliness and flexibility. + + +# Acknowledgments + +- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) +- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) +- [fast-langdetect](https://github.com/LlmKira/fast-langdetect) +- [pdfminer.six](https://github.com/pdfminer/pdfminer.six) + + +# Citation + +```bibtex +@misc{2024mineru, + title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool}, + author={MinerU Contributors}, + howpublished = {\url{https://github.com/opendatalab/MinerU}}, + year={2024} +} +``` + + +# Star History + + + + + + Star History Chart + + diff --git a/README_zh-CN.md b/README_zh-CN.md new file mode 100644 index 0000000000000000000000000000000000000000..ed36ae7e11a16065d7628d230678c075fc61fc5b --- /dev/null +++ b/README_zh-CN.md @@ -0,0 +1,277 @@ +
+
+ +[![stars](https://img.shields.io/github/stars/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![forks](https://img.shields.io/github/forks/opendatalab/MinerU.svg)](https://github.com/opendatalab/MinerU) +[![open issues](https://img.shields.io/github/issues-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/opendatalab/MinerU)](https://github.com/opendatalab/MinerU/issues) +[![PyPI version](https://badge.fury.io/py/magic-pdf.svg)](https://badge.fury.io/py/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf)](https://pepy.tech/project/magic-pdf) +[![Downloads](https://static.pepy.tech/badge/magic-pdf/month)](https://pepy.tech/project/magic-pdf) + +[English](README.md) | [简体中文](README_zh-CN.md) + +
+ +
+ +
+ +# MinerU + + +## 简介 + +MinerU 是一款一站式、开源、高质量的数据提取工具,主要包含以下功能: + +- [Magic-PDF](#Magic-PDF) PDF文档提取 +- [Magic-Doc](#Magic-Doc) 网页与电子书提取 + +# Magic-PDF + + +## 简介 + +Magic-PDF 是一款将 PDF 转化为 markdown 格式的工具。支持转换本地文档或者位于支持S3协议对象存储上的文件。 + +主要功能包含 + +- 支持多种前端模型输入 +- 删除页眉、页脚、脚注、页码等元素 +- 符合人类阅读顺序的排版格式 +- 保留原文档的结构和格式,包括标题、段落、列表等 +- 提取图像和表格并在markdown中展示 +- 将公式转换成latex +- 乱码PDF自动识别并转换 +- 支持cpu和gpu环境 +- 支持windows/linux/mac平台 + + +https://github.com/opendatalab/MinerU/assets/11393164/618937cb-dc6a-4646-b433-e3131a5f4070 + + + +## 项目全景 + +![项目全景图](docs/images/project_panorama_zh_cn.png) + +## 流程图 + +![流程图](docs/images/flowchart_zh_cn.png) + +### 子模块仓库 + +- [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) + - 高质量的PDF内容提取工具包 + +## 上手指南 + +### 配置要求 + +python >= 3.9 + +推荐使用虚拟环境,以避免可能发生的依赖冲突,venv和conda均可使用。 +例如: +```bash +conda create -n MinerU python=3.10 +conda activate MinerU +``` +开发基于python 3.10,如果在其他版本python出现问题请切换至3.10。 + +### 安装配置 + +#### 1. 安装Magic-PDF + +使用pip安装完整功能包: +>受pypi限制,pip安装的完整功能包仅支持cpu推理,建议只用于快速测试解析能力。 +> +>如需在生产环境使用CUDA/MPS加速请参考[使用CUDA或MPS加速推理](#4-使用CUDA或MPS加速推理) +```bash +pip install magic-pdf[full-cpu] +``` +完整功能包依赖detectron2,该库需要编译安装,如需自行编译,请参考 https://github.com/facebookresearch/detectron2/issues/5114 +或是直接使用我们预编译的whl包(仅限python 3.10): +```bash +pip install detectron2 --extra-index-url https://myhloli.github.io/wheels/ +``` + +#### 2. 下载模型权重文件 + +详细参考 [如何下载模型文件](docs/how_to_download_models_zh_cn.md) +下载后请将models目录移动到空间较大的ssd磁盘目录 + +#### 3. 拷贝配置文件并进行配置 +在仓库根目录可以获得 [magic-pdf.template.json](magic-pdf.template.json) 文件 +```bash +cp magic-pdf.template.json ~/magic-pdf.json +``` +在magic-pdf.json中配置"models-dir"为模型权重文件所在目录 +```json +{ + "models-dir": "/tmp/models" +} +``` + +#### 4. 
使用CUDA或MPS加速推理 +如您有可用的Nvidia显卡或在使用Apple Silicon的Mac,可以使用CUDA或MPS进行加速 +##### CUDA + +需要根据自己的CUDA版本安装对应的pytorch版本 +以下是对应CUDA 11.8版本的安装命令,更多信息请参考 https://pytorch.org/get-started/locally/ +```bash +pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 --index-url https://download.pytorch.org/whl/cu118 +``` + +同时需要修改配置文件magic-pdf.json中"device-mode"的值 +```json +{ + "device-mode":"cuda" +} +``` + +##### MPS +使用macOS(M系列芯片设备)可以使用MPS进行推理加速 +需要修改配置文件magic-pdf.json中"device-mode"的值 +```json +{ + "device-mode":"mps" +} +``` + + +### 使用说明 + +#### 1. 通过命令行使用 + +###### 直接使用 + +```bash +magic-pdf pdf-command --pdf "pdf_path" --inside_model true +``` +程序运行完成后,你可以在"/tmp/magic-pdf"目录下看到生成的markdown文件,markdown目录中可以找到对应的xxx_model.json文件 +如果您有意对后处理pipeline进行二次开发,可以使用命令 +```bash +magic-pdf pdf-command --pdf "pdf_path" --model "model_json_path" +``` +这样就不需要重跑模型数据,调试起来更方便 + +###### 更多用法 + +```bash +magic-pdf --help +``` + + +#### 2. 通过接口调用 + +###### 本地使用 +```python +image_writer = DiskReaderWriter(local_image_dir) +image_dir = str(os.path.basename(local_image_dir)) +jso_useful_key = {"_pdf_type": "", "model_list": model_json} +pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) +pipe.pipe_classify() +pipe.pipe_parse() +md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") +``` + +###### 在对象存储上使用 +```python +s3pdf_cli = S3ReaderWriter(pdf_ak, pdf_sk, pdf_endpoint) +image_dir = "s3://img_bucket/" +s3image_cli = S3ReaderWriter(img_ak, img_sk, img_endpoint, parent_path=image_dir) +pdf_bytes = s3pdf_cli.read(s3_pdf_path, mode=s3pdf_cli.MODE_BIN) +jso_useful_key = {"_pdf_type": "", "model_list": model_json} +pipe = UNIPipe(pdf_bytes, jso_useful_key, s3image_cli) +pipe.pipe_classify() +pipe.pipe_parse() +md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") +``` + +详细实现可参考 [demo.py](demo/demo.py) + + +### 常见问题处理解答 + +参考 [FAQ](docs/FAQ_zh_cn.md) + + +# Magic-Doc + + +## 简介 + +Magic-Doc 是一款支持将网页或多格式电子书转换为 markdown 格式的工具。 + +主要功能包含 + +- Web网页提取 + - 跨模态精准解析图文、表格、公式信息 + 
+- 电子书文献提取 + - 支持 epub,mobi等多格式文献,文本图片全适配 + +- 语言类型鉴定 + - 支持176种语言的准确识别 + +https://github.com/opendatalab/MinerU/assets/11393164/a5a650e9-f4c0-463e-acc3-960967f1a1ca + + + +https://github.com/opendatalab/MinerU/assets/11393164/0f4a6fe9-6cca-4113-9fdc-a537749d764d + + + +https://github.com/opendatalab/MinerU/assets/11393164/20438a02-ce6c-4af8-9dde-d722a4e825b2 + + + + +## 项目仓库 + +- [Magic-Doc](https://github.com/InternLM/magic-doc) + 优秀的网页与电子书提取工具 + + +## 感谢我们的贡献者 + + + + + + +## 版权说明 + +[LICENSE.md](LICENSE.md) + +本项目目前采用PyMuPDF以实现高级功能,但因其遵循AGPL协议,可能对某些使用场景构成限制。未来版本迭代中,我们计划探索并替换为许可条款更为宽松的PDF处理库,以提升用户友好度及灵活性。 + + +## 致谢 +- [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR) +- [PyMuPDF](https://github.com/pymupdf/PyMuPDF) +- [fast-langdetect](https://github.com/LlmKira/fast-langdetect) +- [pdfminer.six](https://github.com/pdfminer/pdfminer.six) + + +# 引用 + +```bibtex +@misc{2024mineru, + title={MinerU: A One-stop, Open-source, High-quality Data Extraction Tool}, + author={MinerU Contributors}, + howpublished = {\url{https://github.com/opendatalab/MinerU}}, + year={2024} +} +``` + + +# Star History + + + + + + Star History Chart + + \ No newline at end of file diff --git a/demo/app.py b/demo/app.py new file mode 100644 index 0000000000000000000000000000000000000000..83cbaf909de88fa82486a3e30f37f396fa99471b --- /dev/null +++ b/demo/app.py @@ -0,0 +1,67 @@ +import os +import json +import gradio as gr +from loguru import logger +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +import magic_pdf.model as model_config + +model_config.__use_inside_model__ = True + + +def process_pdf(file_path): + try: + pdf_bytes = open(file_path, "rb").read() + model_json = [] # model_json传空list使用内置模型解析 + jso_useful_key = {"_pdf_type": "", "model_list": model_json} + local_image_dir = os.path.join('uploads', 'images') + if not os.path.exists(local_image_dir): + os.makedirs(local_image_dir) + image_dir = 
str(os.path.basename(local_image_dir)) + image_writer = DiskReaderWriter(local_image_dir) + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) + pipe.pipe_classify() + if len(model_json) == 0: + if model_config.__use_inside_model__: + pipe.pipe_analyze() + else: + logger.error("need model list input") + return None + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") + return md_content + except Exception as e: + logger.exception(e) + return None + + +def extract_markdown_from_pdf(pdf): + # 保存上传的PDF文件 + file_path = os.path.join('uploads', pdf.name) + with open(file_path, 'wb') as f: + f.write(pdf.read()) + + # 处理PDF文件并生成Markdown内容 + md_content = process_pdf(file_path) + return md_content + + +def main(): + # 创建Gradio接口 + with gr.Blocks() as demo: + gr.Markdown("# PDF to Markdown Converter") + + with gr.Row(): + with gr.Column(): + pdf_file = gr.File(label="Upload PDF", file_types=['.pdf']) + md_output = gr.Markdown(label="Extracted Markdown") + + extract_button = gr.Button("Extract Markdown") + extract_button.click(extract_markdown_from_pdf, inputs=[ + pdf_file], outputs=[md_output]) + + demo.launch(share=True) + + +if __name__ == '__main__': + main() diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..35d956322990a0246b92ad9d6434967a64e61ab1 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,31 @@ +import os +import json + +from loguru import logger + +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter + +import magic_pdf.model as model_config +model_config.__use_inside_model__ = True + +try: + current_script_dir = os.path.dirname(os.path.abspath(__file__)) + demo_name = "demo1" + pdf_path = os.path.join(current_script_dir, f"{demo_name}.pdf") + model_path = os.path.join(current_script_dir, f"{demo_name}.json") + pdf_bytes = open(pdf_path, "rb").read() + # model_json = json.loads(open(model_path, "r", 
encoding="utf-8").read()) + model_json = [] # model_json传空list使用内置模型解析 + jso_useful_key = {"_pdf_type": "", "model_list": model_json} + local_image_dir = os.path.join(current_script_dir, 'images') + image_dir = str(os.path.basename(local_image_dir)) + image_writer = DiskReaderWriter(local_image_dir) + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer) + pipe.pipe_classify() + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode="none") + with open(f"{demo_name}.md", "w", encoding="utf-8") as f: + f.write(md_content) +except Exception as e: + logger.exception(e) \ No newline at end of file diff --git a/demo/demo1.json b/demo/demo1.json new file mode 100644 index 0000000000000000000000000000000000000000..e3b5a30aab7edc2779b083257a0e0cf1473c84d9 --- /dev/null +++ b/demo/demo1.json @@ -0,0 +1 @@ +[{"layout_dets": [{"category_id": 2, "poly": [117.85147857666016, 198.19203186035156, 268.09375, 198.19203186035156, 268.09375, 365.4513854980469, 117.85147857666016, 365.4513854980469], "score": 1.0}, {"category_id": 2, "poly": [516.9244995117188, 193.8611297607422, 983.7249145507812, 193.8611297607422, 983.7249145507812, 288.566650390625, 516.9244995117188, 288.566650390625], "score": 0.9999980926513672}, {"category_id": 2, "poly": [119.0521469116211, 1793.3775634765625, 774.3035888671875, 1793.3775634765625, 774.3035888671875, 1842.8583984375, 119.0521469116211, 1842.8583984375], "score": 0.9999951720237732}, {"category_id": 1, "poly": [213.19744873046875, 621.9070434570312, 1290.4381103515625, 621.9070434570312, 1290.4381103515625, 733.4085693359375, 213.19744873046875, 733.4085693359375], "score": 0.9999936819076538}, {"category_id": 1, "poly": [390.47998046875, 751.6647338867188, 1108.0994873046875, 751.6647338867188, 1108.0994873046875, 774.5253295898438, 390.47998046875, 774.5253295898438], "score": 0.9999909400939941}, {"category_id": 2, "poly": [556.6760864257812, 343.6651306152344, 942.158447265625, 343.6651306152344, 942.158447265625, 
368.6150207519531, 556.6760864257812, 368.6150207519531], "score": 0.9999899864196777}, {"category_id": 0, "poly": [245.8207244873047, 472.72943115234375, 1257.65380859375, 472.72943115234375, 1257.65380859375, 520.0311889648438, 245.8207244873047, 520.0311889648438], "score": 0.9999768137931824}, {"category_id": 2, "poly": [1119.6229248046875, 199.3274383544922, 1376.630859375, 199.3274383544922, 1376.630859375, 384.0538024902344, 1119.6229248046875, 384.0538024902344], "score": 0.9999668002128601}, {"category_id": 1, "poly": [118.14305114746094, 1571.5140380859375, 864.8477172851562, 1571.5140380859375, 864.8477172851562, 1594.3565673828125, 118.14305114746094, 1594.3565673828125], "score": 0.999945342540741}, {"category_id": 0, "poly": [118.69384002685547, 862.561767578125, 209.67910766601562, 862.561767578125, 209.67910766601562, 888.9332885742188, 118.69384002685547, 888.9332885742188], "score": 0.9999412298202515}, {"category_id": 1, "poly": [239.3308868408203, 550.2936401367188, 1257.6968994140625, 550.2936401367188, 1257.6968994140625, 596.7587280273438, 239.3308868408203, 596.7587280273438], "score": 0.9999355673789978}, {"category_id": 2, "poly": [117.71773529052734, 1687.8800048828125, 1379.2835693359375, 1687.8800048828125, 1379.2835693359375, 1766.3516845703125, 117.71773529052734, 1766.3516845703125], "score": 0.999925971031189}, {"category_id": 1, "poly": [115.68157958984375, 913.7571411132812, 1385.33837890625, 913.7571411132812, 1385.33837890625, 1533.5689697265625, 115.68157958984375, 1533.5689697265625], "score": 0.999893307685852}, {"category_id": 2, "poly": [1084.155517578125, 374.07135009765625, 1378.12109375, 374.07135009765625, 1378.12109375, 396.0621032714844, 1084.155517578125, 396.0621032714844], "score": 0.9371034502983093}, {"category_id": 13, "poly": [714, 1383, 767, 1383, 767, 1411, 714, 1411], "score": 0.89, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [571, 1351, 636, 1351, 636, 1380, 571, 1380], "score": 0.87, 
"latex": "(N_{\\mathrm{zero}})"}, {"category_id": 13, "poly": [398, 1793, 419, 1793, 419, 1815, 398, 1815], "score": 0.75, "latex": "\\copyright"}, {"category_id": 13, "poly": [116, 1509, 140, 1509, 140, 1533, 116, 1533], "score": 0.73, "latex": "\\copyright"}, {"category_id": 13, "poly": [315, 1713, 479, 1713, 479, 1739, 315, 1739], "score": 0.36, "latex": "+61\\;3\\;9450\\;8719"}, {"category_id": 13, "poly": [148, 1743, 166, 1743, 166, 1765, 148, 1765], "score": 0.35, "latex": "E"}, {"category_id": 13, "poly": [369, 1743, 387, 1743, 387, 1764, 369, 1764], "score": 0.26, "latex": "@"}, {"category_id": 15, "poly": [120.0, 338.0, 266.0, 338.0, 266.0, 374.0, 120.0, 374.0], "score": 1.0, "text": "ELSEVIER"}, {"category_id": 15, "poly": [515.0, 194.0, 986.0, 194.0, 986.0, 224.0, 515.0, 224.0], "score": 0.99, "text": "Available online at www.sciencedirect.com"}, {"category_id": 15, "poly": [599.0, 245.0, 728.0, 245.0, 728.0, 275.0, 599.0, 275.0], "score": 0.99, "text": "SCIENCE"}, {"category_id": 15, "poly": [712.0, 237.0, 905.0, 229.0, 907.0, 281.0, 714.0, 289.0], "score": 0.77, "text": "CDIRECT."}, {"category_id": 15, "poly": [116.0, 1819.0, 427.0, 1819.0, 427.0, 1847.0, 116.0, 1847.0], "score": 0.99, "text": "doi:10.1016/j.jhydrol.2005.01.006"}, {"category_id": 15, "poly": [114.0, 1793.0, 397.0, 1793.0, 397.0, 1821.0, 114.0, 1821.0], "score": 0.96, "text": "0022-1694/$ - see front matter"}, {"category_id": 15, "poly": [420.0, 1793.0, 777.0, 1793.0, 777.0, 1821.0, 420.0, 1821.0], "score": 0.98, "text": " 2005 Elsevier B.V. 
All rights reserved."}, {"category_id": 15, "poly": [210.0, 624.0, 1291.0, 624.0, 1291.0, 654.0, 210.0, 654.0], "score": 0.97, "text": "aSchool of Forest and Ecosystem Studies,University of Melbourne,P.O.Box 137,Heidelberg,Victoria 3084,Australia"}, {"category_id": 15, "poly": [460.0, 647.0, 1040.0, 649.0, 1039.0, 679.0, 460.0, 677.0], "score": 0.96, "text": "bCSIRODivision of Land andWater,Canberra,ACT,Australia"}, {"category_id": 15, "poly": [369.0, 679.0, 1130.0, 679.0, 1130.0, 710.0, 369.0, 710.0], "score": 0.97, "text": "cCooperative Research Centre for Catchment Hydrology, Canberra,ACT, Australia"}, {"category_id": 15, "poly": [299.0, 701.0, 1203.0, 703.0, 1203.0, 740.0, 299.0, 737.0], "score": 0.98, "text": "dDepartment of Civil and Environmental Engineering, University of Melbourne, Victoria, Australia"}, {"category_id": 15, "poly": [389.0, 750.0, 1108.0, 750.0, 1108.0, 780.0, 389.0, 780.0], "score": 0.99, "text": "Received 1 October 2003; revised 22 December 2004; accepted 3 January 2005"}, {"category_id": 15, "poly": [554.0, 340.0, 945.0, 337.0, 945.0, 374.0, 554.0, 376.0], "score": 0.98, "text": "Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [247.0, 477.0, 1252.0, 477.0, 1252.0, 520.0, 247.0, 520.0], "score": 0.99, "text": "The response of flow duration curves to afforestation"}, {"category_id": 15, "poly": [1165.0, 212.0, 1285.0, 218.0, 1283.0, 256.0, 1164.0, 251.0], "score": 1.0, "text": "Journal"}, {"category_id": 15, "poly": [1171.0, 260.0, 1207.0, 260.0, 1207.0, 290.0, 1171.0, 290.0], "score": 0.84, "text": "of"}, {"category_id": 15, "poly": [1157.0, 290.0, 1379.0, 297.0, 1378.0, 351.0, 1155.0, 343.0], "score": 1.0, "text": "Hydrology"}, {"category_id": 15, "poly": [1164.0, 374.0, 1368.0, 374.0, 1368.0, 389.0, 1164.0, 389.0], "score": 0.73, "text": "nuriarnom/laonta/ihrdr"}, {"category_id": 15, "poly": [116.0, 1572.0, 868.0, 1572.0, 868.0, 1600.0, 116.0, 1600.0], "score": 0.99, "text": "Keywords: Afforestation; Flow duration 
curves; Flow reduction; Paired catchments"}, {"category_id": 15, "poly": [116.0, 862.0, 213.0, 862.0, 213.0, 894.0, 116.0, 894.0], "score": 1.0, "text": "Abstract"}, {"category_id": 15, "poly": [238.0, 557.0, 1254.0, 557.0, 1254.0, 600.0, 238.0, 600.0], "score": 0.94, "text": "Patrick N.J. Lanea,c,*, Alice E. Bestb,c,d, Klaus Hickelb;c, Lu Zhangbc"}, {"category_id": 15, "poly": [127.0, 1681.0, 1381.0, 1683.0, 1381.0, 1720.0, 127.0, 1718.0], "score": 0.98, "text": "* Corresponding author. Address: Forest Science Centre, Department of Sustainability and Environment, P.O. Box 137, Heidelberg, Vic."}, {"category_id": 15, "poly": [114.0, 1711.0, 314.0, 1714.0, 314.0, 1744.0, 114.0, 1741.0], "score": 0.97, "text": "3084,Australia.Tel.:"}, {"category_id": 15, "poly": [480.0, 1711.0, 702.0, 1714.0, 702.0, 1744.0, 480.0, 1741.0], "score": 0.93, "text": ";fax: +61 3 9450 8644."}, {"category_id": 15, "poly": [167.0, 1744.0, 368.0, 1744.0, 368.0, 1772.0, 167.0, 1772.0], "score": 1.0, "text": "mailaddress:patrickl"}, {"category_id": 15, "poly": [388.0, 1744.0, 657.0, 1744.0, 657.0, 1772.0, 388.0, 1772.0], "score": 1.0, "text": "unimelb.edu.au (P.N.J. Lane)."}, {"category_id": 15, "poly": [137.0, 912.0, 1385.0, 912.0, 1385.0, 948.0, 137.0, 948.0], "score": 0.98, "text": " The hydrologic effect of replacing pasture or other short crops with trees is reasonably well understood on a mean annual"}, {"category_id": 15, "poly": [116.0, 946.0, 1383.0, 946.0, 1383.0, 976.0, 116.0, 976.0], "score": 0.99, "text": "basis. The impact on fow regime, as described by the annual flow duration curve (FDC) is less certain. A method to assess the"}, {"category_id": 15, "poly": [114.0, 974.0, 1383.0, 974.0, 1383.0, 1010.0, 114.0, 1010.0], "score": 0.99, "text": "impact of plantation establishment on FDCs was developed. 
The starting point for the analyses was the assumption that rainfall"}, {"category_id": 15, "poly": [116.0, 1008.0, 1381.0, 1008.0, 1381.0, 1038.0, 116.0, 1038.0], "score": 0.99, "text": "and vegetation age are the principal drivers of evapotranspiration. A key objective was to remove the variability in the rainfall"}, {"category_id": 15, "poly": [116.0, 1041.0, 1381.0, 1041.0, 1381.0, 1071.0, 116.0, 1071.0], "score": 0.99, "text": "signal, leaving changes in streamflow solely attributable to the evapotranspiration of the plantation. A method was developed to"}, {"category_id": 15, "poly": [116.0, 1073.0, 1381.0, 1073.0, 1381.0, 1103.0, 116.0, 1103.0], "score": 0.98, "text": "(1) fit a model to the observed annual time series of FDC percentiles; i.e. 1oth percentile for each year of record with annual"}, {"category_id": 15, "poly": [114.0, 1101.0, 1381.0, 1103.0, 1381.0, 1133.0, 114.0, 1131.0], "score": 0.99, "text": "rainfall and plantation age as parameters, (2) replace the annual rainfall variation with the long term mean to obtain climate"}, {"category_id": 15, "poly": [118.0, 1135.0, 1383.0, 1135.0, 1383.0, 1165.0, 118.0, 1165.0], "score": 0.99, "text": "adjusted FDCs, and (3) quantify changes in FDC percentiles as plantations age. Data from 10 catchments from Australia, South"}, {"category_id": 15, "poly": [118.0, 1165.0, 1381.0, 1165.0, 1381.0, 1195.0, 118.0, 1195.0], "score": 0.99, "text": "Africa and New Zealand were used. The model was able to represent flow variation for the majority of percentiles at eight of the"}, {"category_id": 15, "poly": [114.0, 1191.0, 1383.0, 1193.0, 1383.0, 1230.0, 114.0, 1228.0], "score": 0.98, "text": "10 catchments, particularly for the 10-50th percentiles. The adjusted FDCs revealed variable patterns in flow reductions with"}, {"category_id": 15, "poly": [116.0, 1230.0, 1379.0, 1230.0, 1379.0, 1260.0, 116.0, 1260.0], "score": 0.98, "text": "two types of responses (groups) being identified. 
Group 1 catchments show a substantial increase in the number of zero fow"}, {"category_id": 15, "poly": [114.0, 1258.0, 1381.0, 1260.0, 1381.0, 1290.0, 114.0, 1288.0], "score": 0.98, "text": "days, with low flows being more affected than high flows. Group 2 catchments show a more uniform reduction in flows across"}, {"category_id": 15, "poly": [116.0, 1292.0, 1383.0, 1292.0, 1383.0, 1322.0, 116.0, 1322.0], "score": 0.98, "text": "all percentiles. The differences may be partly explained by storage characteristics. The modelled fow reductions were in accord"}, {"category_id": 15, "poly": [116.0, 1322.0, 1381.0, 1322.0, 1381.0, 1352.0, 116.0, 1352.0], "score": 1.0, "text": "with published results of paired catchment experiments. An additional analysis was performed to characterise the impact of"}, {"category_id": 15, "poly": [116.0, 1417.0, 1381.0, 1417.0, 1381.0, 1447.0, 116.0, 1447.0], "score": 1.0, "text": "in the occurrence of any given flow in response to afforestation. The methods used in this study proved satisfactory in removing"}, {"category_id": 15, "poly": [116.0, 1449.0, 1383.0, 1449.0, 1383.0, 1479.0, 116.0, 1479.0], "score": 0.99, "text": "the rainfall variability, and have added useful insight into the hydrologic impacts of plantation establishment. 
This approach"}, {"category_id": 15, "poly": [116.0, 1479.0, 1379.0, 1479.0, 1379.0, 1509.0, 116.0, 1509.0], "score": 0.99, "text": "provides a methodology for understanding catchment response to afforestation, where paired catchment data is not available."}, {"category_id": 15, "poly": [114.0, 1382.0, 713.0, 1387.0, 713.0, 1417.0, 114.0, 1413.0], "score": 0.98, "text": "when adjusted for climate, indicated a significant increase in"}, {"category_id": 15, "poly": [768.0, 1382.0, 1381.0, 1387.0, 1381.0, 1417.0, 768.0, 1413.0], "score": 0.98, "text": ".The zero flow day method could be used to determine change"}, {"category_id": 15, "poly": [116.0, 1354.0, 570.0, 1354.0, 570.0, 1385.0, 116.0, 1385.0], "score": 0.98, "text": "afforestation on the number of zero flow days"}, {"category_id": 15, "poly": [637.0, 1354.0, 1383.0, 1354.0, 1383.0, 1385.0, 637.0, 1385.0], "score": 0.99, "text": "for the catchments in group 1. This model performed particularly well, and"}, {"category_id": 15, "poly": [141.0, 1507.0, 541.0, 1509.0, 541.0, 1539.0, 141.0, 1537.0], "score": 0.98, "text": "2005 Elsevier B.V. 
All rights reserved."}, {"category_id": 15, "poly": [1080.0, 368.0, 1383.0, 365.0, 1383.0, 402.0, 1080.0, 404.0], "score": 0.99, "text": "www.elsevier.com/locate/jhydrol"}], "page_info": {"page_no": 0, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [130.931640625, 251.82516479492188, 312.8154296875, 251.82516479492188, 312.8154296875, 283.4620056152344, 130.931640625, 283.4620056152344], "score": 0.9999987483024597}, {"category_id": 4, "poly": [794.2171020507812, 763.5051879882812, 1396.4493408203125, 763.5051879882812, 1396.4493408203125, 818.8292236328125, 794.2171020507812, 818.8292236328125], "score": 0.9999982714653015}, {"category_id": 1, "poly": [130.19113159179688, 1017.6807861328125, 732.7059326171875, 1017.6807861328125, 732.7059326171875, 1849.8070068359375, 130.19113159179688, 1849.8070068359375], "score": 0.9999954104423523}, {"category_id": 1, "poly": [793.3727416992188, 1280.632568359375, 1397.07080078125, 1280.632568359375, 1397.07080078125, 1849.0452880859375, 793.3727416992188, 1849.0452880859375], "score": 0.9999947547912598}, {"category_id": 1, "poly": [793.5277099609375, 849.8186645507812, 1397.0140380859375, 849.8186645507812, 1397.0140380859375, 1280.6221923828125, 793.5277099609375, 1280.6221923828125], "score": 0.999994158744812}, {"category_id": 1, "poly": [130.5381317138672, 317.5604248046875, 731.9227905273438, 317.5604248046875, 731.9227905273438, 1015.91748046875, 130.5381317138672, 1015.91748046875], "score": 0.9999940395355225}, {"category_id": 2, "poly": [130.44467163085938, 194.42764282226562, 166.39125061035156, 194.42764282226562, 166.39125061035156, 215.1434783935547, 130.44467163085938, 215.1434783935547], "score": 0.999992847442627}, {"category_id": 2, "poly": [479.5857849121094, 195.1154022216797, 1045.4803466796875, 195.1154022216797, 1045.4803466796875, 218.7963104248047, 479.5857849121094, 218.7963104248047], "score": 0.99998939037323}, {"category_id": 3, "poly": [799.3821411132812, 
256.1320495605469, 1390.73681640625, 256.1320495605469, 1390.73681640625, 742.4434204101562, 799.3821411132812, 742.4434204101562], "score": 0.9999882578849792}, {"category_id": 13, "poly": [984, 1180, 1065, 1180, 1065, 1211, 984, 1211], "score": 0.88, "latex": "<20\\%"}, {"category_id": 13, "poly": [128, 1415, 183, 1415, 183, 1445, 128, 1445], "score": 0.86, "latex": "95\\%"}, {"category_id": 13, "poly": [573, 618, 723, 618, 723, 649, 573, 649], "score": 0.67, "latex": "400\u2013500\\;\\mathrm{mm}"}, {"category_id": 15, "poly": [127.0, 249.0, 316.0, 254.0, 315.0, 291.0, 126.0, 286.0], "score": 1.0, "text": "1. Introduction"}, {"category_id": 15, "poly": [793.0, 765.0, 1394.0, 765.0, 1394.0, 793.0, 793.0, 793.0], "score": 0.98, "text": "Fig. 1. Annual flow duration curves of daily flows from Pine Creek,"}, {"category_id": 15, "poly": [793.0, 793.0, 999.0, 793.0, 999.0, 821.0, 793.0, 821.0], "score": 0.97, "text": "Australia, 1989-2000."}, {"category_id": 15, "poly": [161.0, 1017.0, 735.0, 1017.0, 735.0, 1054.0, 161.0, 1054.0], "score": 0.98, "text": "Zhang et al. (1999, 2001) developed simple and"}, {"category_id": 15, "poly": [127.0, 1051.0, 735.0, 1051.0, 735.0, 1088.0, 127.0, 1088.0], "score": 0.99, "text": "easily parameterised models to predict changes in"}, {"category_id": 15, "poly": [129.0, 1086.0, 730.0, 1086.0, 730.0, 1116.0, 129.0, 1116.0], "score": 0.99, "text": "mean annual fows following afforestation. 
However,"}, {"category_id": 15, "poly": [129.0, 1120.0, 732.0, 1120.0, 732.0, 1150.0, 129.0, 1150.0], "score": 0.98, "text": "there is a need to consider the annual flow regime as the"}, {"category_id": 15, "poly": [129.0, 1152.0, 732.0, 1152.0, 732.0, 1182.0, 129.0, 1182.0], "score": 0.99, "text": "relative changes in high and low flows may have"}, {"category_id": 15, "poly": [129.0, 1187.0, 730.0, 1187.0, 730.0, 1217.0, 129.0, 1217.0], "score": 0.98, "text": "considerable site specific and downstream impacts.."}, {"category_id": 15, "poly": [129.0, 1219.0, 732.0, 1219.0, 732.0, 1249.0, 129.0, 1249.0], "score": 0.99, "text": "Sikka et al. (2003) recently showed a change from"}, {"category_id": 15, "poly": [127.0, 1249.0, 734.0, 1247.0, 735.0, 1284.0, 127.0, 1286.0], "score": 1.0, "text": "grassland to Eucalyptus globulus plantations in India"}, {"category_id": 15, "poly": [129.0, 1284.0, 728.0, 1284.0, 728.0, 1314.0, 129.0, 1314.0], "score": 0.98, "text": "decreased alow flow index by a factor of two during the"}, {"category_id": 15, "poly": [127.0, 1316.0, 735.0, 1316.0, 735.0, 1352.0, 127.0, 1352.0], "score": 0.99, "text": "first rotation (9 years), and by 3.75 during the second"}, {"category_id": 15, "poly": [129.0, 1352.0, 732.0, 1352.0, 732.0, 1382.0, 129.0, 1382.0], "score": 1.0, "text": "rotation, with more subdued impact on peak flows. The"}, {"category_id": 15, "poly": [129.0, 1385.0, 732.0, 1385.0, 732.0, 1415.0, 129.0, 1415.0], "score": 0.99, "text": "index was defined as the 10 day average flow exceeded"}, {"category_id": 15, "poly": [125.0, 1447.0, 735.0, 1449.0, 734.0, 1486.0, 125.0, 1483.0], "score": 0.98, "text": "duration curves. 
Scott and Smith (1997) reported"}, {"category_id": 15, "poly": [129.0, 1486.0, 732.0, 1486.0, 732.0, 1516.0, 129.0, 1516.0], "score": 0.96, "text": "proportionally greater reductions in low fows"}, {"category_id": 15, "poly": [125.0, 1511.0, 737.0, 1514.0, 737.0, 1550.0, 125.0, 1548.0], "score": 0.98, "text": "(75-100th percentiles) than annual flows from South"}, {"category_id": 15, "poly": [127.0, 1548.0, 735.0, 1550.0, 734.0, 1580.0, 127.0, 1578.0], "score": 0.99, "text": "African research catchments under conversions from"}, {"category_id": 15, "poly": [125.0, 1582.0, 737.0, 1580.0, 737.0, 1617.0, 125.0, 1619.0], "score": 0.98, "text": " grass to pine and eucalypt plantations, while Bosch"}, {"category_id": 15, "poly": [129.0, 1619.0, 732.0, 1619.0, 732.0, 1649.0, 129.0, 1649.0], "score": 0.98, "text": "(1979) found the greatest reduction in seasonal flow"}, {"category_id": 15, "poly": [129.0, 1651.0, 732.0, 1651.0, 732.0, 1681.0, 129.0, 1681.0], "score": 0.98, "text": "from the summer wet season. Fahey and Jackson"}, {"category_id": 15, "poly": [125.0, 1679.0, 735.0, 1681.0, 734.0, 1718.0, 125.0, 1716.0], "score": 0.99, "text": "(1997) reported the reduction in peak flows was twice"}, {"category_id": 15, "poly": [129.0, 1718.0, 732.0, 1718.0, 732.0, 1748.0, 129.0, 1748.0], "score": 0.98, "text": "that of total flow and low flows for pine afforestation in"}, {"category_id": 15, "poly": [125.0, 1746.0, 732.0, 1748.0, 732.0, 1785.0, 125.0, 1782.0], "score": 0.98, "text": " New Zealand. 
The generalisations that can be drawn"}, {"category_id": 15, "poly": [129.0, 1784.0, 728.0, 1784.0, 728.0, 1815.0, 129.0, 1815.0], "score": 0.99, "text": "from annual analyses, where processes and hydrologic"}, {"category_id": 15, "poly": [127.0, 1819.0, 732.0, 1817.0, 732.0, 1847.0, 127.0, 1849.0], "score": 0.99, "text": "responses are to a certain extent integrated may not"}, {"category_id": 15, "poly": [184.0, 1415.0, 732.0, 1417.0, 732.0, 1447.0, 184.0, 1445.0], "score": 0.99, "text": "of the time, obtained from analysis of 10-day flow"}, {"category_id": 15, "poly": [823.0, 1277.0, 1400.0, 1279.0, 1400.0, 1316.0, 823.0, 1314.0], "score": 0.98, "text": " This paper presents the results of a project aimed at"}, {"category_id": 15, "poly": [793.0, 1316.0, 1398.0, 1316.0, 1398.0, 1346.0, 793.0, 1346.0], "score": 0.96, "text": "quantifying changes in annual fow regime of"}, {"category_id": 15, "poly": [793.0, 1350.0, 1398.0, 1350.0, 1398.0, 1380.0, 793.0, 1380.0], "score": 0.99, "text": "catchments following plantation establishment. The"}, {"category_id": 15, "poly": [793.0, 1385.0, 1398.0, 1385.0, 1398.0, 1415.0, 793.0, 1415.0], "score": 0.98, "text": "flow regime is represented by the flow duration curve"}, {"category_id": 15, "poly": [793.0, 1417.0, 1398.0, 1417.0, 1398.0, 1447.0, 793.0, 1447.0], "score": 0.99, "text": "(FDC). The key assumption was that rainfall and"}, {"category_id": 15, "poly": [793.0, 1451.0, 1396.0, 1451.0, 1396.0, 1481.0, 793.0, 1481.0], "score": 0.99, "text": "forest age are the principal drivers of evapotranspira-"}, {"category_id": 15, "poly": [788.0, 1481.0, 1400.0, 1481.0, 1400.0, 1518.0, 788.0, 1518.0], "score": 0.99, "text": "tion. 
For any generalisation of response of the FDC to"}, {"category_id": 15, "poly": [793.0, 1518.0, 1398.0, 1518.0, 1398.0, 1548.0, 793.0, 1548.0], "score": 0.99, "text": "vegetation change, the variation in the annual climate"}, {"category_id": 15, "poly": [790.0, 1550.0, 1398.0, 1550.0, 1398.0, 1580.0, 790.0, 1580.0], "score": 0.97, "text": "signal must be removed. The time-tested solution to"}, {"category_id": 15, "poly": [790.0, 1585.0, 1398.0, 1585.0, 1398.0, 1615.0, 790.0, 1615.0], "score": 1.0, "text": "this problem is the paired-catchment (control versus"}, {"category_id": 15, "poly": [790.0, 1617.0, 1398.0, 1617.0, 1398.0, 1647.0, 790.0, 1647.0], "score": 0.98, "text": "treatment) experiment. The benefits in such studies"}, {"category_id": 15, "poly": [793.0, 1651.0, 1396.0, 1651.0, 1396.0, 1681.0, 793.0, 1681.0], "score": 0.98, "text": "are manifold: unambiguous measures of trends,"}, {"category_id": 15, "poly": [790.0, 1686.0, 1392.0, 1686.0, 1392.0, 1716.0, 790.0, 1716.0], "score": 0.99, "text": "insights into the processes driving those trends,"}, {"category_id": 15, "poly": [793.0, 1716.0, 1400.0, 1716.0, 1400.0, 1752.0, 793.0, 1752.0], "score": 0.96, "text": "excellent opportunities for model parameterisation"}, {"category_id": 15, "poly": [793.0, 1750.0, 1394.0, 1750.0, 1394.0, 1780.0, 793.0, 1780.0], "score": 0.98, "text": "and validation. However these data are not readily"}, {"category_id": 15, "poly": [790.0, 1784.0, 1390.0, 1784.0, 1390.0, 1815.0, 790.0, 1815.0], "score": 0.99, "text": "available for the range of treamtments and environ-"}, {"category_id": 15, "poly": [790.0, 1817.0, 1396.0, 1817.0, 1396.0, 1847.0, 790.0, 1847.0], "score": 0.99, "text": " ments required. Consequently, the aims of this project"}, {"category_id": 15, "poly": [793.0, 851.0, 1398.0, 851.0, 1398.0, 882.0, 793.0, 882.0], "score": 0.99, "text": "apply on a seasonal or shorter scale. 
Further, the"}, {"category_id": 15, "poly": [788.0, 879.0, 1398.0, 882.0, 1398.0, 918.0, 788.0, 916.0], "score": 1.0, "text": " observed impacts of any land use change on flows may"}, {"category_id": 15, "poly": [788.0, 916.0, 1400.0, 916.0, 1400.0, 952.0, 788.0, 952.0], "score": 0.96, "text": "be exaggerated or understated depending on the"}, {"category_id": 15, "poly": [788.0, 948.0, 1400.0, 948.0, 1400.0, 985.0, 788.0, 985.0], "score": 0.99, "text": "prevailing climate. Observations of fow during"}, {"category_id": 15, "poly": [793.0, 985.0, 1398.0, 985.0, 1398.0, 1015.0, 793.0, 1015.0], "score": 0.98, "text": "extended wet or dry spells, or with high annual"}, {"category_id": 15, "poly": [793.0, 1017.0, 1398.0, 1017.0, 1398.0, 1047.0, 793.0, 1047.0], "score": 1.0, "text": "variability can obscure the real impacts. Fig. 1 plots"}, {"category_id": 15, "poly": [790.0, 1051.0, 1398.0, 1051.0, 1398.0, 1081.0, 790.0, 1081.0], "score": 0.98, "text": " annual FDCs over 12 years of plantation growth for one"}, {"category_id": 15, "poly": [793.0, 1084.0, 1398.0, 1084.0, 1398.0, 1114.0, 793.0, 1114.0], "score": 0.99, "text": "of the catchments used in this study, Pine Creek. The"}, {"category_id": 15, "poly": [786.0, 1114.0, 1400.0, 1116.0, 1400.0, 1152.0, 786.0, 1150.0], "score": 0.97, "text": " net change in flow is obscured by rainfall variability;"}, {"category_id": 15, "poly": [788.0, 1148.0, 1400.0, 1146.0, 1400.0, 1182.0, 788.0, 1185.0], "score": 1.0, "text": "e.g. 
the greatest change in the FDC is in 1996, with the"}, {"category_id": 15, "poly": [786.0, 1215.0, 1398.0, 1213.0, 1398.0, 1249.0, 786.0, 1251.0], "score": 0.99, "text": " compared with 2000, where there is substantially"}, {"category_id": 15, "poly": [788.0, 1249.0, 941.0, 1249.0, 941.0, 1279.0, 788.0, 1279.0], "score": 0.99, "text": "higher flows."}, {"category_id": 15, "poly": [788.0, 1180.0, 983.0, 1180.0, 983.0, 1217.0, 788.0, 1217.0], "score": 0.96, "text": "stream flowing"}, {"category_id": 15, "poly": [1066.0, 1180.0, 1400.0, 1180.0, 1400.0, 1217.0, 1066.0, 1217.0], "score": 0.96, "text": " of the time. This may be"}, {"category_id": 15, "poly": [161.0, 318.0, 728.0, 318.0, 728.0, 355.0, 161.0, 355.0], "score": 1.0, "text": "Widespread afforestation through plantation estab-"}, {"category_id": 15, "poly": [125.0, 348.0, 732.0, 350.0, 732.0, 387.0, 125.0, 385.0], "score": 1.0, "text": "lishment on non-forested land represents a potentially"}, {"category_id": 15, "poly": [129.0, 389.0, 732.0, 389.0, 732.0, 417.0, 129.0, 417.0], "score": 0.98, "text": "significant alteration of catchment evapotranspiration"}, {"category_id": 15, "poly": [129.0, 421.0, 730.0, 421.0, 730.0, 452.0, 129.0, 452.0], "score": 0.98, "text": "(ET). 
Using data collated from multiple catchment"}, {"category_id": 15, "poly": [129.0, 456.0, 732.0, 456.0, 732.0, 484.0, 129.0, 484.0], "score": 0.99, "text": "studies, researchers have demonstrated a consistent"}, {"category_id": 15, "poly": [125.0, 482.0, 737.0, 484.0, 737.0, 520.0, 125.0, 518.0], "score": 0.98, "text": " difference in ET between forests and grass or short "}, {"category_id": 15, "poly": [122.0, 518.0, 734.0, 516.0, 735.0, 553.0, 123.0, 555.0], "score": 0.99, "text": " crops, and the relationship between ET and rainfall on"}, {"category_id": 15, "poly": [127.0, 553.0, 732.0, 553.0, 732.0, 583.0, 127.0, 583.0], "score": 1.0, "text": "a mean annual basis (Holmes and Sinclair, 1986;"}, {"category_id": 15, "poly": [127.0, 585.0, 732.0, 585.0, 732.0, 621.0, 127.0, 621.0], "score": 0.99, "text": "Vertessy and Bessard, 1999; Zhang et al., 1999,"}, {"category_id": 15, "poly": [129.0, 654.0, 732.0, 654.0, 732.0, 684.0, 129.0, 684.0], "score": 0.99, "text": "there is an increasing divergence between forest and"}, {"category_id": 15, "poly": [125.0, 684.0, 734.0, 682.0, 735.0, 718.0, 125.0, 720.0], "score": 0.99, "text": "grassland ET (Zhang et al., 2001). Research from"}, {"category_id": 15, "poly": [127.0, 718.0, 732.0, 718.0, 732.0, 755.0, 127.0, 755.0], "score": 0.98, "text": "South Africa in particular has demonstrated flow"}, {"category_id": 15, "poly": [129.0, 755.0, 730.0, 755.0, 730.0, 785.0, 129.0, 785.0], "score": 1.0, "text": "reduction following afforestation with both pine and"}, {"category_id": 15, "poly": [125.0, 783.0, 732.0, 780.0, 732.0, 817.0, 125.0, 819.0], "score": 0.99, "text": "eucalypt species (Bosch, 1979; Van Lill et al., 1980;"}, {"category_id": 15, "poly": [131.0, 819.0, 732.0, 819.0, 732.0, 849.0, 131.0, 849.0], "score": 0.98, "text": "Van Wyk, 1987; Bosch and Von Gadow, 1990; Scott"}, {"category_id": 15, "poly": [129.0, 854.0, 730.0, 854.0, 730.0, 884.0, 129.0, 884.0], "score": 0.99, "text": "and Smith, 1997; Scott et al., 2000). 
In regions, where"}, {"category_id": 15, "poly": [129.0, 888.0, 732.0, 888.0, 732.0, 918.0, 129.0, 918.0], "score": 1.0, "text": "water is an increasingly valuable resource, prediction"}, {"category_id": 15, "poly": [125.0, 914.0, 735.0, 916.0, 734.0, 952.0, 125.0, 950.0], "score": 1.0, "text": " of the long-term hydrologic impact of afforestation is"}, {"category_id": 15, "poly": [127.0, 952.0, 732.0, 952.0, 732.0, 983.0, 127.0, 983.0], "score": 1.0, "text": "a prerequisite for the optimal planning of catchment"}, {"category_id": 15, "poly": [126.0, 982.0, 232.0, 987.0, 231.0, 1017.0, 124.0, 1012.0], "score": 0.98, "text": "land use."}, {"category_id": 15, "poly": [129.0, 619.0, 572.0, 619.0, 572.0, 649.0, 129.0, 649.0], "score": 0.97, "text": "2001). Once annual rainfall exceeds "}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 234.0, 127.0, 234.0], "score": 0.86, "text": "254"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}], "page_info": {"page_no": 1, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [117.54735565185547, 651.1103515625, 250.780029296875, 651.1103515625, 250.780029296875, 683.0104370117188, 117.54735565185547, 683.0104370117188], "score": 0.9999984502792358}, {"category_id": 0, "poly": [118.68109130859375, 719.37060546875, 523.2320556640625, 719.37060546875, 523.2320556640625, 748.71435546875, 118.68109130859375, 748.71435546875], "score": 0.9999982714653015}, {"category_id": 1, "poly": [782.3466796875, 254.3662872314453, 1379.406005859375, 254.3662872314453, 1379.406005859375, 382.8451843261719, 782.3466796875, 382.8451843261719], "score": 0.9999969005584717}, {"category_id": 2, "poly": [466.16595458984375, 194.14617919921875, 1030.9322509765625, 194.14617919921875, 1030.9322509765625, 218.86849975585938, 466.16595458984375, 218.86849975585938], "score": 0.9999963641166687}, {"category_id": 9, "poly": [1347.212890625, 1178.8819580078125, 1379.9034423828125, 1178.8819580078125, 1379.9034423828125, 1209.0960693359375, 1347.212890625, 1209.0960693359375], "score": 0.9999951124191284}, {"category_id": 1, "poly": [118.17451477050781, 252.63734436035156, 717.2734375, 252.63734436035156, 717.2734375, 582.23974609375, 118.17451477050781, 582.23974609375], "score": 0.999994158744812}, {"category_id": 1, "poly": [780.9387817382812, 518.9439697265625, 1381.2352294921875, 518.9439697265625, 1381.2352294921875, 1114.6259765625, 780.9387817382812, 1114.6259765625], "score": 0.9999930262565613}, {"category_id": 9, "poly": [1346.75439453125, 438.8963317871094, 1380.3604736328125, 438.8963317871094, 1380.3604736328125, 467.5118713378906, 1346.75439453125, 467.5118713378906], "score": 0.9999922513961792}, {"category_id": 1, "poly": [781.1512451171875, 1283.9832763671875, 1380.4686279296875, 1283.9832763671875, 1380.4686279296875, 1845.6868896484375, 781.1512451171875, 1845.6868896484375], "score": 
0.9999905824661255}, {"category_id": 1, "poly": [118.1343994140625, 788.8043212890625, 716.4190673828125, 788.8043212890625, 716.4190673828125, 1282.203125, 118.1343994140625, 1282.203125], "score": 0.9999904632568359}, {"category_id": 2, "poly": [1346.32177734375, 194.7462615966797, 1381.36328125, 194.7462615966797, 1381.36328125, 216.9466552734375, 1346.32177734375, 216.9466552734375], "score": 0.9999903440475464}, {"category_id": 1, "poly": [117.631591796875, 1283.8558349609375, 716.6098022460938, 1283.8558349609375, 716.6098022460938, 1847.49853515625, 117.631591796875, 1847.49853515625], "score": 0.9999891519546509}, {"category_id": 8, "poly": [778.0137939453125, 1156.5975341796875, 1201.7086181640625, 1156.5975341796875, 1201.7086181640625, 1238.48828125, 778.0137939453125, 1238.48828125], "score": 0.9998936653137207}, {"category_id": 8, "poly": [779.0469360351562, 433.1261901855469, 996.4776000976562, 433.1261901855469, 996.4776000976562, 470.7110595703125, 779.0469360351562, 470.7110595703125], "score": 0.979882001876831}, {"category_id": 14, "poly": [777, 1156, 1200, 1156, 1200, 1237, 777, 1237], "score": 0.92, "latex": "Q_{\\mathcal{U}}=a+b(\\Delta P)+\\frac{Y}{1+\\exp\\!\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 13, "poly": [1150, 520, 1201, 520, 1201, 551, 1150, 551], "score": 0.9, "latex": "f(P)"}, {"category_id": 13, "poly": [1210, 1384, 1262, 1384, 1262, 1414, 1210, 1414], "score": 0.9, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [856, 520, 897, 520, 897, 550, 856, 550], "score": 0.9, "latex": "Q_{\\mathcal{k}}"}, {"category_id": 13, "poly": [930, 552, 982, 552, 982, 584, 930, 584], "score": 0.89, "latex": "g(T)"}, {"category_id": 13, "poly": [857, 1285, 898, 1285, 898, 1315, 857, 1315], "score": 0.89, "latex": "Q_{\\mathcal{k}}"}, {"category_id": 13, "poly": [1196, 1649, 1278, 1649, 1278, 1678, 1196, 1678], "score": 0.89, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [1270, 1483, 1311, 1483, 1311, 
1515, 1270, 1515], "score": 0.89, "latex": "Q_{\\mathrm{\\small{\\mathscr{k}}}}"}, {"category_id": 13, "poly": [1259, 1418, 1301, 1418, 1301, 1449, 1259, 1449], "score": 0.89, "latex": "Q_{\\mathbb{X}}"}, {"category_id": 13, "poly": [1075, 1682, 1140, 1682, 1140, 1711, 1075, 1711], "score": 0.88, "latex": "a+Y."}, {"category_id": 13, "poly": [895, 1483, 976, 1483, 976, 1512, 895, 1512], "score": 0.88, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [1206, 1285, 1252, 1285, 1252, 1315, 1206, 1315], "score": 0.88, "latex": "Q_{50}"}, {"category_id": 13, "poly": [779, 1682, 821, 1682, 821, 1714, 779, 1714], "score": 0.88, "latex": "Q_{\\mathrm{\\%}}"}, {"category_id": 13, "poly": [1313, 1649, 1374, 1649, 1374, 1678, 1313, 1678], "score": 0.87, "latex": "T{=}0"}, {"category_id": 14, "poly": [777, 432, 997, 432, 997, 470, 777, 470], "score": 0.83, "latex": "\\begin{array}{r}{Q_{\\%}=f(P)+g(T)}\\end{array}"}, {"category_id": 13, "poly": [963, 1350, 1002, 1350, 1002, 1378, 963, 1378], "score": 0.8, "latex": "\\Delta P"}, {"category_id": 13, "poly": [989, 1318, 1012, 1318, 1012, 1345, 989, 1345], "score": 0.64, "latex": "Y"}, {"category_id": 13, "poly": [1077, 1318, 1098, 1318, 1098, 1345, 1077, 1345], "score": 0.64, "latex": "S"}, {"category_id": 13, "poly": [1239, 1583, 1262, 1583, 1262, 1611, 1239, 1611], "score": 0.51, "latex": "S"}, {"category_id": 13, "poly": [989, 1488, 1008, 1488, 1008, 1511, 989, 1511], "score": 0.3, "latex": "a"}, {"category_id": 15, "poly": [112.0, 651.0, 256.0, 651.0, 256.0, 688.0, 112.0, 688.0], "score": 0.96, "text": "2. Methods"}, {"category_id": 15, "poly": [112.0, 716.0, 526.0, 720.0, 526.0, 757.0, 112.0, 752.0], "score": 0.99, "text": "2.1. 
Characterisation of fow regime"}, {"category_id": 15, "poly": [778.0, 249.0, 1383.0, 252.0, 1383.0, 288.0, 777.0, 286.0], "score": 0.99, "text": " closure, a time term is required to represent plantation"}, {"category_id": 15, "poly": [777.0, 288.0, 1385.0, 284.0, 1385.0, 320.0, 778.0, 325.0], "score": 0.99, "text": "growth. A simple model relating the time series of"}, {"category_id": 15, "poly": [778.0, 318.0, 1383.0, 323.0, 1383.0, 357.0, 777.0, 353.0], "score": 0.99, "text": "each decile with rainfall and vegetation characteristics"}, {"category_id": 15, "poly": [782.0, 357.0, 1018.0, 357.0, 1018.0, 387.0, 782.0, 387.0], "score": 0.99, "text": "can be expressed as:"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [112.0, 249.0, 722.0, 252.0, 722.0, 288.0, 112.0, 286.0], "score": 0.99, "text": "were to (1) fit a model to the observed annual time"}, {"category_id": 15, "poly": [116.0, 290.0, 719.0, 290.0, 719.0, 320.0, 116.0, 320.0], "score": 0.98, "text": "series of FDC percentiles; i.e. 10th percentile for each"}, {"category_id": 15, "poly": [112.0, 320.0, 722.0, 320.0, 722.0, 357.0, 112.0, 357.0], "score": 0.99, "text": " year of record with annual rainfall and plantation age"}, {"category_id": 15, "poly": [116.0, 355.0, 719.0, 355.0, 719.0, 385.0, 116.0, 385.0], "score": 1.0, "text": "as parameters, (2) replace the annual rainfall variation"}, {"category_id": 15, "poly": [116.0, 389.0, 722.0, 389.0, 722.0, 419.0, 116.0, 419.0], "score": 0.98, "text": "with the long term mean to obtain climate adjusted"}, {"category_id": 15, "poly": [116.0, 421.0, 719.0, 421.0, 719.0, 452.0, 116.0, 452.0], "score": 0.99, "text": "FDCs, and (3) quantify changes in FDC percentiles as"}, {"category_id": 15, "poly": [116.0, 456.0, 717.0, 456.0, 717.0, 486.0, 116.0, 486.0], "score": 1.0, "text": "plantations age. 
If the climate signal, represented by"}, {"category_id": 15, "poly": [112.0, 482.0, 722.0, 486.0, 721.0, 523.0, 112.0, 518.0], "score": 0.98, "text": "rainfall, could be successfully removed, the resulting"}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 553.0, 116.0, 553.0], "score": 0.99, "text": "changes in the FDC would be solely attributable to the"}, {"category_id": 15, "poly": [114.0, 557.0, 243.0, 557.0, 243.0, 587.0, 114.0, 587.0], "score": 1.0, "text": "vegetation."}, {"category_id": 15, "poly": [780.0, 587.0, 1385.0, 587.0, 1385.0, 617.0, 780.0, 617.0], "score": 0.99, "text": "plantation. Annual rainfall was chosen as the rainfall"}, {"category_id": 15, "poly": [780.0, 621.0, 1385.0, 621.0, 1385.0, 649.0, 780.0, 649.0], "score": 0.99, "text": "statistic as it proved to be the most robust predictor of"}, {"category_id": 15, "poly": [780.0, 654.0, 1385.0, 654.0, 1385.0, 684.0, 780.0, 684.0], "score": 0.97, "text": "flow over the whole range of flow percentiles, as"}, {"category_id": 15, "poly": [777.0, 686.0, 1383.0, 686.0, 1383.0, 722.0, 777.0, 722.0], "score": 0.98, "text": " compared with rainfall percentiles; e.g. median rain-"}, {"category_id": 15, "poly": [777.0, 718.0, 1385.0, 718.0, 1385.0, 755.0, 777.0, 755.0], "score": 0.97, "text": "fall versus 10th flow percentile. The use of annual"}, {"category_id": 15, "poly": [775.0, 748.0, 1385.0, 750.0, 1385.0, 787.0, 775.0, 785.0], "score": 0.99, "text": "rainfall also minimises parameter complexity. The"}, {"category_id": 15, "poly": [782.0, 787.0, 1383.0, 787.0, 1383.0, 817.0, 782.0, 817.0], "score": 0.98, "text": "choice of model form is dependent on selecting a"}, {"category_id": 15, "poly": [780.0, 821.0, 1383.0, 821.0, 1383.0, 849.0, 780.0, 849.0], "score": 0.99, "text": "function that describes the relationship between forest"}, {"category_id": 15, "poly": [777.0, 854.0, 1383.0, 851.0, 1383.0, 881.0, 778.0, 884.0], "score": 0.98, "text": "age and ET. 
Scott and Smith (1997\uff09 demonstrated"}, {"category_id": 15, "poly": [780.0, 886.0, 1383.0, 886.0, 1383.0, 916.0, 780.0, 916.0], "score": 0.98, "text": "cumulative reductions in annual and low flows"}, {"category_id": 15, "poly": [780.0, 920.0, 1383.0, 920.0, 1383.0, 950.0, 780.0, 950.0], "score": 0.98, "text": "resulting from afforestation fitted a sigmoidal"}, {"category_id": 15, "poly": [777.0, 952.0, 1379.0, 952.0, 1379.0, 983.0, 777.0, 983.0], "score": 0.99, "text": "function, similar to forest growth functions. Conse-"}, {"category_id": 15, "poly": [775.0, 985.0, 1385.0, 983.0, 1385.0, 1019.0, 775.0, 1021.0], "score": 0.99, "text": " quently, we used a sigmoidal function to characterise"}, {"category_id": 15, "poly": [780.0, 1019.0, 1381.0, 1019.0, 1381.0, 1049.0, 780.0, 1049.0], "score": 0.99, "text": "the impact of plantation growth on each fow decile."}, {"category_id": 15, "poly": [780.0, 1054.0, 1383.0, 1054.0, 1383.0, 1084.0, 780.0, 1084.0], "score": 0.98, "text": "Fig. 2a is a schematic of the change in the FDC over"}, {"category_id": 15, "poly": [777.0, 1086.0, 1143.0, 1086.0, 1143.0, 1116.0, 777.0, 1116.0], "score": 0.99, "text": "time. 
The model took the form:"}, {"category_id": 15, "poly": [1202.0, 522.0, 1385.0, 522.0, 1385.0, 550.0, 1202.0, 550.0], "score": 0.99, "text": "is a function of"}, {"category_id": 15, "poly": [782.0, 522.0, 855.0, 522.0, 855.0, 550.0, 782.0, 550.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [898.0, 522.0, 1149.0, 522.0, 1149.0, 550.0, 898.0, 550.0], "score": 0.98, "text": "is the percentile flow,"}, {"category_id": 15, "poly": [780.0, 555.0, 929.0, 555.0, 929.0, 585.0, 780.0, 585.0], "score": 0.95, "text": "rainfall and"}, {"category_id": 15, "poly": [983.0, 555.0, 1383.0, 555.0, 1383.0, 585.0, 983.0, 585.0], "score": 0.98, "text": " is a function of the age of the"}, {"category_id": 15, "poly": [780.0, 1453.0, 1385.0, 1453.0, 1385.0, 1484.0, 780.0, 1484.0], "score": 1.0, "text": "afforestation has taken place. For the average climate"}, {"category_id": 15, "poly": [775.0, 1516.0, 1383.0, 1516.0, 1383.0, 1546.0, 775.0, 1546.0], "score": 0.98, "text": "the new equilibrium plantation water use under"}, {"category_id": 15, "poly": [777.0, 1552.0, 1385.0, 1552.0, 1385.0, 1582.0, 777.0, 1582.0], "score": 0.99, "text": " afforestation is reached. Y then gives the magnitude"}, {"category_id": 15, "poly": [780.0, 1619.0, 1385.0, 1619.0, 1385.0, 1649.0, 780.0, 1649.0], "score": 0.97, "text": "the shape of the response as shown in Fig. 2b. For"}, {"category_id": 15, "poly": [780.0, 1718.0, 1383.0, 1718.0, 1383.0, 1748.0, 780.0, 1748.0], "score": 0.98, "text": "afforestation condition would not require the time"}, {"category_id": 15, "poly": [780.0, 1752.0, 1383.0, 1752.0, 1383.0, 1782.0, 780.0, 1782.0], "score": 0.98, "text": "term. 
Details of the optimisation scheme and"}, {"category_id": 15, "poly": [780.0, 1784.0, 1383.0, 1784.0, 1383.0, 1815.0, 780.0, 1815.0], "score": 1.0, "text": "sensitivity tests on initial parameter values are given"}, {"category_id": 15, "poly": [780.0, 1817.0, 1020.0, 1817.0, 1020.0, 1847.0, 780.0, 1847.0], "score": 0.97, "text": "in Lane et al. (2003)."}, {"category_id": 15, "poly": [777.0, 1382.0, 1209.0, 1382.0, 1209.0, 1419.0, 777.0, 1419.0], "score": 0.98, "text": "from the period of record average, and"}, {"category_id": 15, "poly": [1263.0, 1382.0, 1385.0, 1382.0, 1385.0, 1419.0, 1263.0, 1419.0], "score": 0.99, "text": "is the time"}, {"category_id": 15, "poly": [782.0, 1286.0, 856.0, 1286.0, 856.0, 1316.0, 782.0, 1316.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [777.0, 1649.0, 1195.0, 1649.0, 1195.0, 1686.0, 777.0, 1686.0], "score": 1.0, "text": "the average pre-treatment condition"}, {"category_id": 15, "poly": [1312.0, 1486.0, 1385.0, 1486.0, 1385.0, 1516.0, 1312.0, 1516.0], "score": 1.0, "text": "when"}, {"category_id": 15, "poly": [780.0, 1419.0, 1258.0, 1419.0, 1258.0, 1449.0, 780.0, 1449.0], "score": 0.97, "text": "in years at which half of the reduction in"}, {"category_id": 15, "poly": [1302.0, 1419.0, 1385.0, 1419.0, 1385.0, 1449.0, 1302.0, 1449.0], "score": 1.0, "text": "due to"}, {"category_id": 15, "poly": [1141.0, 1686.0, 1379.0, 1686.0, 1379.0, 1716.0, 1141.0, 1716.0], "score": 0.95, "text": " Estimation of a pre-"}, {"category_id": 15, "poly": [780.0, 1486.0, 894.0, 1486.0, 894.0, 1516.0, 780.0, 1516.0], "score": 1.0, "text": "condition"}, {"category_id": 15, "poly": [899.0, 1286.0, 1205.0, 1286.0, 1205.0, 1316.0, 899.0, 1316.0], "score": 0.98, "text": "is the percentile flow (i.e."}, {"category_id": 15, "poly": [1253.0, 1286.0, 1383.0, 1286.0, 1383.0, 1316.0, 1253.0, 1316.0], "score": 1.0, "text": "is the 50th"}, {"category_id": 15, "poly": [822.0, 1686.0, 1074.0, 1686.0, 1074.0, 1716.0, 822.0, 1716.0], "score": 0.99, 
"text": " approximately equals"}, {"category_id": 15, "poly": [1279.0, 1649.0, 1312.0, 1649.0, 1312.0, 1686.0, 1279.0, 1686.0], "score": 1.0, "text": "at"}, {"category_id": 15, "poly": [777.0, 1352.0, 962.0, 1350.0, 962.0, 1380.0, 778.0, 1382.0], "score": 1.0, "text": "sigmoidal term,"}, {"category_id": 15, "poly": [1003.0, 1352.0, 1385.0, 1350.0, 1385.0, 1380.0, 1003.0, 1382.0], "score": 0.99, "text": "is the deviation of annual rainfall"}, {"category_id": 15, "poly": [775.0, 1316.0, 988.0, 1314.0, 988.0, 1350.0, 775.0, 1352.0], "score": 0.97, "text": "percentile flow),"}, {"category_id": 15, "poly": [1013.0, 1316.0, 1076.0, 1314.0, 1076.0, 1350.0, 1013.0, 1352.0], "score": 0.9, "text": " and"}, {"category_id": 15, "poly": [1099.0, 1316.0, 1385.0, 1314.0, 1385.0, 1350.0, 1099.0, 1352.0], "score": 0.98, "text": " are coefficients of the"}, {"category_id": 15, "poly": [780.0, 1587.0, 1238.0, 1587.0, 1238.0, 1617.0, 780.0, 1617.0], "score": 0.99, "text": "of change due to afforestation, and "}, {"category_id": 15, "poly": [1263.0, 1587.0, 1385.0, 1587.0, 1385.0, 1617.0, 1263.0, 1617.0], "score": 0.99, "text": " describes"}, {"category_id": 15, "poly": [1009.0, 1486.0, 1269.0, 1486.0, 1269.0, 1516.0, 1009.0, 1516.0], "score": 1.0, "text": "becomes the value of"}, {"category_id": 15, "poly": [144.0, 783.0, 720.0, 785.0, 719.0, 821.0, 144.0, 819.0], "score": 0.99, "text": "Flow duration curves display the relationship"}, {"category_id": 15, "poly": [116.0, 821.0, 719.0, 821.0, 719.0, 851.0, 116.0, 851.0], "score": 0.96, "text": "between streamflow and the percentage of time"}, {"category_id": 15, "poly": [116.0, 854.0, 717.0, 854.0, 717.0, 884.0, 116.0, 884.0], "score": 0.98, "text": "the streamflow is exceeded as a cumulative density"}, {"category_id": 15, "poly": [116.0, 888.0, 719.0, 888.0, 719.0, 918.0, 116.0, 918.0], "score": 1.0, "text": "function They can be constructed for any time period"}, {"category_id": 15, "poly": [116.0, 920.0, 715.0, 920.0, 715.0, 950.0, 
116.0, 950.0], "score": 0.99, "text": "(daily, weekly, monthly, etc.) and provide a graphical"}, {"category_id": 15, "poly": [114.0, 952.0, 717.0, 955.0, 717.0, 985.0, 114.0, 983.0], "score": 0.99, "text": "and statistical view of historic streamflow variability"}, {"category_id": 15, "poly": [114.0, 987.0, 717.0, 987.0, 717.0, 1017.0, 114.0, 1017.0], "score": 0.99, "text": "in a single catchment or a comparison of inter-"}, {"category_id": 15, "poly": [112.0, 1017.0, 722.0, 1017.0, 722.0, 1054.0, 112.0, 1054.0], "score": 0.99, "text": "catchment flow regimes. Vogel and Fennessey (1994)"}, {"category_id": 15, "poly": [110.0, 1047.0, 722.0, 1049.0, 722.0, 1086.0, 109.0, 1084.0], "score": 0.99, "text": "and Smakhtin (1999, 2001) demonstrate the utility"}, {"category_id": 15, "poly": [114.0, 1088.0, 719.0, 1088.0, 719.0, 1118.0, 114.0, 1118.0], "score": 1.0, "text": "(and caveats) of FDCs in characterising, comparing"}, {"category_id": 15, "poly": [114.0, 1120.0, 722.0, 1120.0, 722.0, 1150.0, 114.0, 1150.0], "score": 0.97, "text": "and predicting flow regimes at varying temporal"}, {"category_id": 15, "poly": [112.0, 1150.0, 724.0, 1150.0, 724.0, 1187.0, 112.0, 1187.0], "score": 0.98, "text": "scales. Fig. 1 is an example of annual FDCs"}, {"category_id": 15, "poly": [114.0, 1187.0, 722.0, 1187.0, 722.0, 1217.0, 114.0, 1217.0], "score": 0.99, "text": "constructed from daily flows. 
For the consideration"}, {"category_id": 15, "poly": [110.0, 1215.0, 722.0, 1217.0, 722.0, 1253.0, 109.0, 1251.0], "score": 0.99, "text": " of annual flow regime, daily fows are an appropriate"}, {"category_id": 15, "poly": [114.0, 1253.0, 477.0, 1253.0, 477.0, 1284.0, 114.0, 1284.0], "score": 0.99, "text": "time step for FDC construction."}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "255"}, {"category_id": 15, "poly": [148.0, 1284.0, 715.0, 1284.0, 715.0, 1314.0, 148.0, 1314.0], "score": 0.99, "text": "FDCs were computed from the distribution of daily"}, {"category_id": 15, "poly": [112.0, 1316.0, 720.0, 1320.0, 719.0, 1350.0, 112.0, 1346.0], "score": 1.0, "text": "flows for each year of record based on the appropriate"}, {"category_id": 15, "poly": [116.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 116.0, 1382.0], "score": 0.99, "text": "water years (May-April or November-October) for"}, {"category_id": 15, "poly": [112.0, 1380.0, 722.0, 1382.0, 722.0, 1419.0, 112.0, 1417.0], "score": 0.96, "text": "10 Southern Hemisphere catchments. Each 10th"}, {"category_id": 15, "poly": [114.0, 1419.0, 719.0, 1417.0, 720.0, 1447.0, 114.0, 1449.0], "score": 0.97, "text": "percentile (decile\uff09 was extracted from the annual"}, {"category_id": 15, "poly": [112.0, 1449.0, 720.0, 1451.0, 719.0, 1481.0, 112.0, 1479.0], "score": 0.99, "text": "FDCs of each catchment to form the data sets for"}, {"category_id": 15, "poly": [114.0, 1486.0, 719.0, 1486.0, 719.0, 1516.0, 114.0, 1516.0], "score": 0.99, "text": "analysis. 
For the purpose of characterising changes in"}, {"category_id": 15, "poly": [114.0, 1518.0, 719.0, 1518.0, 719.0, 1546.0, 114.0, 1546.0], "score": 1.0, "text": "each of the deciles, it is assumed that the time series is"}, {"category_id": 15, "poly": [114.0, 1550.0, 719.0, 1550.0, 719.0, 1580.0, 114.0, 1580.0], "score": 0.96, "text": "principally a function of climate and vegetation"}, {"category_id": 15, "poly": [114.0, 1585.0, 722.0, 1585.0, 722.0, 1615.0, 114.0, 1615.0], "score": 0.99, "text": "characteristics. Given rainfall is generally the most"}, {"category_id": 15, "poly": [114.0, 1619.0, 722.0, 1619.0, 722.0, 1649.0, 114.0, 1649.0], "score": 0.97, "text": "important factor affecting streamflow and the most"}, {"category_id": 15, "poly": [116.0, 1651.0, 719.0, 1651.0, 719.0, 1681.0, 116.0, 1681.0], "score": 0.98, "text": "easily accessed data, it is chosen to represent the"}, {"category_id": 15, "poly": [116.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 116.0, 1716.0], "score": 1.0, "text": "climate. Catchment physical properties such as soil"}, {"category_id": 15, "poly": [114.0, 1716.0, 722.0, 1716.0, 722.0, 1752.0, 114.0, 1752.0], "score": 0.98, "text": "properties and topography are assumed to be time"}, {"category_id": 15, "poly": [116.0, 1752.0, 719.0, 1752.0, 719.0, 1782.0, 116.0, 1782.0], "score": 0.99, "text": "invariant and therefore their impact on runoff is"}, {"category_id": 15, "poly": [118.0, 1784.0, 719.0, 1784.0, 719.0, 1815.0, 118.0, 1815.0], "score": 1.0, "text": "considered constant throughout the analysis. 
As trees"}, {"category_id": 15, "poly": [116.0, 1819.0, 715.0, 1819.0, 715.0, 1849.0, 116.0, 1849.0], "score": 1.0, "text": "intercept and transpire at increasing rates until canopy"}], "page_info": {"page_no": 2, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 4, "poly": [129.72743225097656, 1201.7288818359375, 731.863037109375, 1201.7288818359375, 731.863037109375, 1256.5126953125, 129.72743225097656, 1256.5126953125], "score": 0.9999990463256836}, {"category_id": 1, "poly": [130.2001953125, 1783.3021240234375, 730.197509765625, 1783.3021240234375, 730.197509765625, 1846.7928466796875, 130.2001953125, 1846.7928466796875], "score": 0.9999982714653015}, {"category_id": 0, "poly": [797.18896484375, 501.0386047363281, 1060.7071533203125, 501.0386047363281, 1060.7071533203125, 529.1184692382812, 797.18896484375, 529.1184692382812], "score": 0.9999982714653015}, {"category_id": 2, "poly": [130.7757568359375, 195.0663299560547, 166.40858459472656, 195.0663299560547, 166.40858459472656, 215.67367553710938, 130.7757568359375, 215.67367553710938], "score": 0.9999974966049194}, {"category_id": 9, "poly": [1360.5223388671875, 807.8145751953125, 1393.251953125, 807.8145751953125, 1393.251953125, 835.9564819335938, 1360.5223388671875, 835.9564819335938], "score": 0.9999971389770508}, {"category_id": 3, "poly": [140.5875244140625, 256.1985778808594, 711.4976806640625, 256.1985778808594, 711.4976806640625, 1180.2288818359375, 140.5875244140625, 1180.2288818359375], "score": 0.999996542930603}, {"category_id": 1, "poly": [795.8214721679688, 1244.741455078125, 1393.836181640625, 1244.741455078125, 1393.836181640625, 1508.3616943359375, 795.8214721679688, 1508.3616943359375], "score": 0.9999949932098389}, {"category_id": 2, "poly": [480.60809326171875, 195.57171630859375, 1043.654296875, 195.57171630859375, 1043.654296875, 218.8836212158203, 480.60809326171875, 218.8836212158203], "score": 0.9999940395355225}, {"category_id": 1, "poly": [794.92333984375, 
878.8724365234375, 1394.78515625, 878.8724365234375, 1394.78515625, 1241.811279296875, 794.92333984375, 1241.811279296875], "score": 0.9999939799308777}, {"category_id": 8, "poly": [792.0145263671875, 779.4395751953125, 1107.5592041015625, 779.4395751953125, 1107.5592041015625, 865.4520263671875, 792.0145263671875, 865.4520263671875], "score": 0.9999935030937195}, {"category_id": 1, "poly": [794.2274780273438, 567.8933715820312, 1393.5377197265625, 567.8933715820312, 1393.5377197265625, 762.2144775390625, 794.2274780273438, 762.2144775390625], "score": 0.9999918341636658}, {"category_id": 1, "poly": [795.5938110351562, 1715.463134765625, 1394.151611328125, 1715.463134765625, 1394.151611328125, 1845.3857421875, 795.5938110351562, 1845.3857421875], "score": 0.999987781047821}, {"category_id": 1, "poly": [794.4356689453125, 255.30477905273438, 1393.678466796875, 255.30477905273438, 1393.678466796875, 447.8646240234375, 794.4356689453125, 447.8646240234375], "score": 0.9999871253967285}, {"category_id": 1, "poly": [130.53660583496094, 1355.89013671875, 730.9114379882812, 1355.89013671875, 730.9114379882812, 1652.1812744140625, 130.53660583496094, 1652.1812744140625], "score": 0.999987006187439}, {"category_id": 9, "poly": [696.6166381835938, 1699.391845703125, 728.77880859375, 1699.391845703125, 728.77880859375, 1727.2147216796875, 696.6166381835938, 1727.2147216796875], "score": 0.999981164932251}, {"category_id": 9, "poly": [1360.9091796875, 1667.6871337890625, 1393.8095703125, 1667.6871337890625, 1393.8095703125, 1699.094482421875, 1360.9091796875, 1699.094482421875], "score": 0.9999788999557495}, {"category_id": 8, "poly": [790.2078857421875, 1522.67236328125, 1111.4049072265625, 1522.67236328125, 1111.4049072265625, 1604.606689453125, 790.2078857421875, 1604.606689453125], "score": 0.9999706149101257}, {"category_id": 9, "poly": [1361.0799560546875, 1545.7677001953125, 1393.7020263671875, 1545.7677001953125, 1393.7020263671875, 1573.452392578125, 
1361.0799560546875, 1573.452392578125], "score": 0.9998459815979004}, {"category_id": 8, "poly": [127.09381866455078, 1678.0965576171875, 565.4200439453125, 1678.0965576171875, 565.4200439453125, 1756.1007080078125, 127.09381866455078, 1756.1007080078125], "score": 0.9997967481613159}, {"category_id": 8, "poly": [794.1704711914062, 1666.248779296875, 974.3306274414062, 1666.248779296875, 974.3306274414062, 1700.88720703125, 794.1704711914062, 1700.88720703125], "score": 0.9997556209564209}, {"category_id": 0, "poly": [131.9687042236328, 1288.984375, 435.8473205566406, 1288.984375, 435.8473205566406, 1316.791259765625, 131.9687042236328, 1316.791259765625], "score": 0.9995421767234802}, {"category_id": 1, "poly": [794.0263671875, 1622.5870361328125, 839.6729125976562, 1622.5870361328125, 839.6729125976562, 1647.691650390625, 794.0263671875, 1647.691650390625], "score": 0.9984337687492371}, {"category_id": 14, "poly": [790, 777, 1108, 777, 1108, 863, 790, 863], "score": 0.94, "latex": "E=1.0-\\frac{\\sum_{i=1}^{N}(O_{i}-P_{i})^{2}}{\\sum_{i=1}^{N}(O_{i}-\\bar{O})^{2}}"}, {"category_id": 14, "poly": [790, 1521, 1110, 1521, 1110, 1602, 790, 1602], "score": 0.94, "latex": "Q_{\\mathcal{Q}}=a+\\frac{Y}{1+\\exp\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 14, "poly": [125, 1674, 566, 1674, 566, 1756, 125, 1756], "score": 0.93, "latex": "N_{\\mathrm{zero}}=a+b(\\Delta P)+\\frac{Y}{1+\\exp\\left(\\frac{T-T_{\\mathrm{half}}}{S}\\right)}"}, {"category_id": 13, "poly": [1306, 319, 1388, 319, 1388, 349, 1306, 349], "score": 0.91, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 13, "poly": [529, 1555, 589, 1555, 589, 1585, 529, 1585], "score": 0.9, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [1281, 1176, 1365, 1176, 1365, 1205, 1281, 1205], "score": 0.9, "latex": "E\\!>\\!0.7"}, {"category_id": 13, "poly": [880, 1173, 931, 1173, 931, 1206, 880, 1206], "score": 0.89, "latex": "<\\!r^{2}"}, {"category_id": 13, "poly": [873, 1409, 932, 1409, 932, 
1438, 873, 1438], "score": 0.89, "latex": "b\\!=\\!0"}, {"category_id": 13, "poly": [597, 1522, 656, 1522, 656, 1552, 597, 1552], "score": 0.89, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [792, 353, 856, 353, 856, 382, 792, 382], "score": 0.88, "latex": "a+Y"}, {"category_id": 13, "poly": [649, 1782, 731, 1782, 731, 1810, 649, 1810], "score": 0.88, "latex": "\\Delta P\\!=\\!0"}, {"category_id": 14, "poly": [791, 1663, 976, 1663, 976, 1699, 791, 1699], "score": 0.88, "latex": "Q_{\\%}=a+b\\Delta P"}, {"category_id": 13, "poly": [1199, 1409, 1259, 1409, 1259, 1438, 1199, 1438], "score": 0.87, "latex": "Y{=}\\,0"}, {"category_id": 13, "poly": [513, 1487, 585, 1487, 585, 1519, 513, 1519], "score": 0.85, "latex": "(N_{\\mathrm{zero}})"}, {"category_id": 13, "poly": [1335, 1073, 1362, 1073, 1362, 1104, 1335, 1104], "score": 0.84, "latex": "r^{2}"}, {"category_id": 13, "poly": [845, 908, 869, 908, 869, 938, 845, 938], "score": 0.81, "latex": "\\bar{O}"}, {"category_id": 13, "poly": [1123, 880, 1146, 880, 1146, 905, 1123, 905], "score": 0.79, "latex": "P"}, {"category_id": 13, "poly": [1344, 1145, 1367, 1145, 1367, 1171, 1344, 1171], "score": 0.79, "latex": "E"}, {"category_id": 13, "poly": [872, 879, 896, 879, 896, 905, 872, 905], "score": 0.77, "latex": "o"}, {"category_id": 13, "poly": [713, 1521, 731, 1521, 731, 1548, 713, 1548], "score": 0.76, "latex": "b"}, {"category_id": 13, "poly": [1274, 912, 1298, 912, 1298, 938, 1274, 938], "score": 0.76, "latex": "E"}, {"category_id": 13, "poly": [1347, 699, 1369, 699, 1369, 726, 1347, 726], "score": 0.75, "latex": "E"}, {"category_id": 13, "poly": [263, 1815, 326, 1815, 326, 1847, 263, 1847], "score": 0.74, "latex": "N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [185, 1814, 245, 1814, 245, 1845, 185, 1845], "score": 0.73, "latex": "T\\!\\!=\\!\\!0"}, {"category_id": 13, "poly": [1010, 1819, 1023, 1819, 1023, 1842, 1010, 1842], "score": 0.7, "latex": "t"}, {"category_id": 13, "poly": [1207, 565, 1246, 565, 
1246, 596, 1207, 596], "score": 0.67, "latex": "(E)"}, {"category_id": 13, "poly": [1310, 979, 1364, 979, 1364, 1007, 1310, 1007], "score": 0.64, "latex": "-\\infty"}, {"category_id": 13, "poly": [1031, 1754, 1044, 1754, 1044, 1776, 1031, 1776], "score": 0.57, "latex": "t\\cdot"}, {"category_id": 13, "poly": [1313, 1818, 1326, 1818, 1326, 1842, 1313, 1842], "score": 0.57, "latex": "t\\cdot"}, {"category_id": 13, "poly": [960, 1073, 1001, 1073, 1001, 1108, 960, 1108], "score": 0.55, "latex": "(r^{2})"}, {"category_id": 13, "poly": [175, 1555, 194, 1555, 194, 1582, 175, 1582], "score": 0.47, "latex": "S"}, {"category_id": 13, "poly": [1020, 287, 1043, 287, 1043, 315, 1020, 315], "score": 0.38, "latex": "S"}, {"category_id": 13, "poly": [1016, 1076, 1040, 1076, 1040, 1105, 1016, 1105], "score": 0.36, "latex": "E"}, {"category_id": 13, "poly": [599, 1815, 660, 1815, 660, 1845, 599, 1845], "score": 0.35, "latex": "a,~Y"}, {"category_id": 13, "poly": [637, 1816, 660, 1816, 660, 1843, 637, 1843], "score": 0.32, "latex": "Y"}, {"category_id": 13, "poly": [184, 1814, 324, 1814, 324, 1847, 184, 1847], "score": 0.27, "latex": "T\\!\\!=\\!0,\\ N_{\\mathrm{zero}}"}, {"category_id": 15, "poly": [131.0, 1204.0, 732.0, 1204.0, 732.0, 1232.0, 131.0, 1232.0], "score": 1.0, "text": "Fig. 2. 
(a) Schematic of the change in the FDC over time, and"}, {"category_id": 15, "poly": [129.0, 1227.0, 447.0, 1232.0, 446.0, 1260.0, 129.0, 1255.0], "score": 0.98, "text": "(b) definition of model parameters."}, {"category_id": 15, "poly": [159.0, 1778.0, 648.0, 1778.0, 648.0, 1821.0, 159.0, 1821.0], "score": 0.99, "text": "For the average pre-treatment condition "}, {"category_id": 15, "poly": [327.0, 1819.0, 598.0, 1819.0, 598.0, 1849.0, 327.0, 1849.0], "score": 0.98, "text": " approximately equals"}, {"category_id": 15, "poly": [661.0, 1819.0, 728.0, 1819.0, 728.0, 1849.0, 661.0, 1849.0], "score": 1.0, "text": "gives"}, {"category_id": 15, "poly": [129.0, 1819.0, 183.0, 1819.0, 183.0, 1849.0, 129.0, 1849.0], "score": 0.88, "text": "and "}, {"category_id": 15, "poly": [793.0, 499.0, 1065.0, 499.0, 1065.0, 535.0, 793.0, 535.0], "score": 0.98, "text": "2.3. Statistical analyses"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "256"}, {"category_id": 15, "poly": [825.0, 1245.0, 1396.0, 1245.0, 1396.0, 1275.0, 825.0, 1275.0], "score": 0.98, "text": "It is important to assess the significance of the"}, {"category_id": 15, "poly": [790.0, 1279.0, 1396.0, 1279.0, 1396.0, 1309.0, 790.0, 1309.0], "score": 0.97, "text": "model parameters to check the model assumptions"}, {"category_id": 15, "poly": [788.0, 1307.0, 1400.0, 1309.0, 1400.0, 1346.0, 788.0, 1344.0], "score": 1.0, "text": "that rainfall and forest age are driving changes in the"}, {"category_id": 15, "poly": [790.0, 1346.0, 1396.0, 1346.0, 1396.0, 1376.0, 790.0, 1376.0], "score": 0.99, "text": "FDC. The model (2) was split into simplified forms,"}, {"category_id": 15, "poly": [793.0, 1378.0, 1396.0, 1378.0, 1396.0, 1408.0, 793.0, 1408.0], "score": 1.0, "text": "where only the rainfall or time terms were included by"}, {"category_id": 15, "poly": [793.0, 1445.0, 1398.0, 1445.0, 1398.0, 1475.0, 793.0, 1475.0], "score": 0.99, "text": "Eq. (6). 
The component models (5) and (6) were then"}, {"category_id": 15, "poly": [790.0, 1477.0, 1233.0, 1477.0, 1233.0, 1507.0, 790.0, 1507.0], "score": 1.0, "text": "tested against the complete model, (2)."}, {"category_id": 15, "poly": [790.0, 1408.0, 872.0, 1408.0, 872.0, 1445.0, 790.0, 1445.0], "score": 0.99, "text": "setting"}, {"category_id": 15, "poly": [933.0, 1408.0, 1198.0, 1408.0, 1198.0, 1445.0, 933.0, 1445.0], "score": 0.99, "text": ", as shown in Eq. (5), or"}, {"category_id": 15, "poly": [1260.0, 1408.0, 1400.0, 1408.0, 1400.0, 1445.0, 1260.0, 1445.0], "score": 0.97, "text": "as shown in"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [790.0, 944.0, 1400.0, 944.0, 1400.0, 980.0, 790.0, 980.0], "score": 1.0, "text": "minus the ratio of the mean square error to the"}, {"category_id": 15, "poly": [790.0, 1008.0, 1400.0, 1011.0, 1400.0, 1047.0, 790.0, 1045.0], "score": 0.98, "text": "1.0. 
Higher values indicate greater agreement between"}, {"category_id": 15, "poly": [788.0, 1041.0, 1403.0, 1043.0, 1402.0, 1079.0, 788.0, 1077.0], "score": 0.97, "text": " observed and predicted data as per the coefficient of "}, {"category_id": 15, "poly": [793.0, 1112.0, 1400.0, 1112.0, 1400.0, 1142.0, 793.0, 1142.0], "score": 0.97, "text": "evaluating hydrologic modelling because it is a"}, {"category_id": 15, "poly": [790.0, 1213.0, 1121.0, 1213.0, 1121.0, 1241.0, 790.0, 1241.0], "score": 0.99, "text": "indicate adequate model fits."}, {"category_id": 15, "poly": [1366.0, 1178.0, 1398.0, 1178.0, 1398.0, 1208.0, 1366.0, 1208.0], "score": 1.0, "text": "to"}, {"category_id": 15, "poly": [793.0, 1178.0, 879.0, 1178.0, 879.0, 1208.0, 793.0, 1208.0], "score": 1.0, "text": "always"}, {"category_id": 15, "poly": [932.0, 1178.0, 1280.0, 1178.0, 1280.0, 1208.0, 932.0, 1208.0], "score": 1.0, "text": "we have arbitrarily considered"}, {"category_id": 15, "poly": [1363.0, 1075.0, 1402.0, 1075.0, 1402.0, 1112.0, 1363.0, 1112.0], "score": 1.0, "text": "in"}, {"category_id": 15, "poly": [788.0, 909.0, 844.0, 909.0, 844.0, 946.0, 788.0, 946.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1147.0, 875.0, 1398.0, 877.0, 1398.0, 914.0, 1147.0, 912.0], "score": 0.97, "text": " are predicted values,"}, {"category_id": 15, "poly": [793.0, 1146.0, 1343.0, 1146.0, 1343.0, 1176.0, 793.0, 1176.0], "score": 1.0, "text": "measure of the deviation from the 1:1 line. 
As"}, {"category_id": 15, "poly": [1368.0, 1146.0, 1398.0, 1146.0, 1398.0, 1176.0, 1368.0, 1176.0], "score": 1.0, "text": "is"}, {"category_id": 15, "poly": [788.0, 875.0, 871.0, 877.0, 871.0, 914.0, 788.0, 912.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [897.0, 875.0, 1122.0, 877.0, 1122.0, 914.0, 897.0, 912.0], "score": 1.0, "text": "are observed data,"}, {"category_id": 15, "poly": [870.0, 909.0, 1273.0, 909.0, 1273.0, 946.0, 870.0, 946.0], "score": 0.97, "text": " is the mean for the entire period."}, {"category_id": 15, "poly": [1299.0, 909.0, 1398.0, 909.0, 1398.0, 946.0, 1299.0, 946.0], "score": 1.0, "text": "is unity"}, {"category_id": 15, "poly": [793.0, 980.0, 1309.0, 980.0, 1309.0, 1010.0, 793.0, 1010.0], "score": 1.0, "text": "variance in the observed data, and ranges from"}, {"category_id": 15, "poly": [1365.0, 980.0, 1398.0, 980.0, 1398.0, 1010.0, 1365.0, 1010.0], "score": 1.0, "text": "to"}, {"category_id": 15, "poly": [790.0, 1075.0, 959.0, 1075.0, 959.0, 1112.0, 790.0, 1112.0], "score": 1.0, "text": "determination"}, {"category_id": 15, "poly": [1041.0, 1075.0, 1334.0, 1075.0, 1334.0, 1112.0, 1041.0, 1112.0], "score": 0.98, "text": "is used in preference to"}, {"category_id": 15, "poly": [790.0, 596.0, 1398.0, 598.0, 1398.0, 634.0, 790.0, 632.0], "score": 1.0, "text": "Sutcliffe, 1970; Chiew and McMahon, 1993; Legates"}, {"category_id": 15, "poly": [788.0, 628.0, 1396.0, 632.0, 1396.0, 667.0, 788.0, 662.0], "score": 0.96, "text": " and McCabe, 1999) was used as the ^goodness of fit\u2019"}, {"category_id": 15, "poly": [793.0, 669.0, 1398.0, 669.0, 1398.0, 697.0, 793.0, 697.0], "score": 0.99, "text": "measure to evaluate the fit between observed and"}, {"category_id": 15, "poly": [790.0, 736.0, 905.0, 731.0, 907.0, 763.0, 791.0, 768.0], "score": 0.98, "text": "given by:"}, {"category_id": 15, "poly": [790.0, 701.0, 1346.0, 699.0, 1346.0, 729.0, 790.0, 731.0], "score": 0.99, "text": "predicted flow deciles (2) and zero flow days 
(3)."}, {"category_id": 15, "poly": [1370.0, 701.0, 1398.0, 699.0, 1398.0, 729.0, 1370.0, 731.0], "score": 1.0, "text": "is"}, {"category_id": 15, "poly": [827.0, 568.0, 1206.0, 568.0, 1206.0, 598.0, 827.0, 598.0], "score": 0.95, "text": "The coefficient of efficiency"}, {"category_id": 15, "poly": [1247.0, 568.0, 1398.0, 568.0, 1398.0, 598.0, 1247.0, 598.0], "score": 0.97, "text": "(Nash and"}, {"category_id": 15, "poly": [825.0, 1716.0, 1394.0, 1716.0, 1394.0, 1752.0, 825.0, 1752.0], "score": 0.99, "text": "For both the fow duration curve analysis and zero"}, {"category_id": 15, "poly": [795.0, 1784.0, 1392.0, 1784.0, 1392.0, 1815.0, 795.0, 1815.0], "score": 0.98, "text": "whether (5) and (6) were significantly different to (2)."}, {"category_id": 15, "poly": [790.0, 1812.0, 1009.0, 1815.0, 1009.0, 1851.0, 790.0, 1849.0], "score": 0.99, "text": "A critical value of"}, {"category_id": 15, "poly": [790.0, 1750.0, 1030.0, 1752.0, 1030.0, 1782.0, 790.0, 1780.0], "score": 1.0, "text": "flow days analysis, a"}, {"category_id": 15, "poly": [1045.0, 1750.0, 1394.0, 1752.0, 1394.0, 1782.0, 1045.0, 1780.0], "score": 0.98, "text": "-test was then performed to test"}, {"category_id": 15, "poly": [1024.0, 1812.0, 1312.0, 1815.0, 1312.0, 1851.0, 1024.0, 1849.0], "score": 1.0, "text": "exceeding the calculated"}, {"category_id": 15, "poly": [1327.0, 1812.0, 1396.0, 1815.0, 1396.0, 1851.0, 1327.0, 1849.0], "score": 1.0, "text": "value"}, {"category_id": 15, "poly": [795.0, 256.0, 1398.0, 256.0, 1398.0, 286.0, 795.0, 286.0], "score": 0.97, "text": "the magnitude of change in zero flow days due to"}, {"category_id": 15, "poly": [790.0, 389.0, 1398.0, 389.0, 1398.0, 419.0, 790.0, 419.0], "score": 0.96, "text": "new equilibrium condition under afforestation is"}, {"category_id": 15, "poly": [790.0, 421.0, 891.0, 421.0, 891.0, 452.0, 790.0, 452.0], "score": 1.0, "text": "reached."}, {"category_id": 15, "poly": [793.0, 322.0, 1305.0, 322.0, 1305.0, 353.0, 793.0, 353.0], "score": 1.0, 
"text": "response. For the average climate condition"}, {"category_id": 15, "poly": [857.0, 355.0, 1398.0, 355.0, 1398.0, 385.0, 857.0, 385.0], "score": 0.99, "text": "becomes the number of zero flow days when the"}, {"category_id": 15, "poly": [793.0, 290.0, 1019.0, 290.0, 1019.0, 320.0, 793.0, 320.0], "score": 0.98, "text": "afforestation, and"}, {"category_id": 15, "poly": [1044.0, 290.0, 1398.0, 290.0, 1398.0, 320.0, 1044.0, 320.0], "score": 0.95, "text": " describes the shape of the"}, {"category_id": 15, "poly": [157.0, 1350.0, 732.0, 1352.0, 732.0, 1389.0, 157.0, 1387.0], "score": 0.98, "text": " A notable feature of Fig. 1 is the increase in the"}, {"category_id": 15, "poly": [127.0, 1389.0, 735.0, 1389.0, 735.0, 1425.0, 127.0, 1425.0], "score": 0.99, "text": "number of zero fow days. A similar approach to"}, {"category_id": 15, "poly": [129.0, 1423.0, 735.0, 1423.0, 735.0, 1453.0, 129.0, 1453.0], "score": 0.98, "text": "Eq. (2), using an inverse sigmoidal function was"}, {"category_id": 15, "poly": [129.0, 1456.0, 732.0, 1456.0, 732.0, 1486.0, 129.0, 1486.0], "score": 0.98, "text": "employed to assess the impact of afforestation on the"}, {"category_id": 15, "poly": [129.0, 1589.0, 735.0, 1589.0, 735.0, 1619.0, 129.0, 1619.0], "score": 0.99, "text": "rainfall increases, and increases with plantation"}, {"category_id": 15, "poly": [126.0, 1624.0, 220.0, 1618.0, 222.0, 1651.0, 128.0, 1656.0], "score": 1.0, "text": "growth:"}, {"category_id": 15, "poly": [590.0, 1557.0, 732.0, 1557.0, 732.0, 1587.0, 590.0, 1587.0], "score": 1.0, "text": "decreases as"}, {"category_id": 15, "poly": [129.0, 1524.0, 596.0, 1524.0, 596.0, 1554.0, 129.0, 1554.0], "score": 0.98, "text": "the left hand side of Eq. 
(2) is replaced by"}, {"category_id": 15, "poly": [129.0, 1490.0, 512.0, 1490.0, 512.0, 1520.0, 129.0, 1520.0], "score": 0.99, "text": "number of zero flow days per year"}, {"category_id": 15, "poly": [586.0, 1490.0, 732.0, 1490.0, 732.0, 1520.0, 586.0, 1520.0], "score": 0.97, "text": ". In this case,"}, {"category_id": 15, "poly": [657.0, 1524.0, 712.0, 1524.0, 712.0, 1554.0, 657.0, 1554.0], "score": 0.97, "text": ", and"}, {"category_id": 15, "poly": [129.0, 1557.0, 174.0, 1557.0, 174.0, 1587.0, 129.0, 1587.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [195.0, 1557.0, 528.0, 1557.0, 528.0, 1587.0, 195.0, 1587.0], "score": 0.99, "text": "are constrained to negative as"}, {"category_id": 15, "poly": [129.0, 1288.0, 438.0, 1288.0, 438.0, 1324.0, 129.0, 1324.0], "score": 0.99, "text": "2.2. Zero fow day analysis"}, {"category_id": 15, "poly": [788.0, 1617.0, 844.0, 1617.0, 844.0, 1662.0, 788.0, 1662.0], "score": 1.0, "text": "and"}], "page_info": {"page_no": 3, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 1, "poly": [780.4981079101562, 951.0537109375, 1382.5201416015625, 951.0537109375, 1382.5201416015625, 1648.58154296875, 780.4981079101562, 1648.58154296875], "score": 0.9999959468841553}, {"category_id": 2, "poly": [466.9576110839844, 194.6658935546875, 1030.6968994140625, 194.6658935546875, 1030.6968994140625, 219.20504760742188, 466.9576110839844, 219.20504760742188], "score": 0.9999955892562866}, {"category_id": 0, "poly": [782.29931640625, 886.77197265625, 919.8917236328125, 886.77197265625, 919.8917236328125, 915.8782348632812, 782.29931640625, 915.8782348632812], "score": 0.9999911189079285}, {"category_id": 1, "poly": [782.0343017578125, 253.53207397460938, 1382.2440185546875, 253.53207397460938, 1382.2440185546875, 350.4256896972656, 782.0343017578125, 350.4256896972656], "score": 0.9999889731407166}, {"category_id": 1, "poly": [781.621826171875, 653.5359497070312, 1381.272705078125, 653.5359497070312, 1381.272705078125, 
783.18798828125, 781.621826171875, 783.18798828125], "score": 0.9999889731407166}, {"category_id": 5, "poly": [182.05813598632812, 248.86912536621094, 650.3305053710938, 248.86912536621094, 650.3305053710938, 1845.613037109375, 182.05813598632812, 1845.613037109375], "score": 0.9999887347221375}, {"category_id": 1, "poly": [781.0881958007812, 1650.3038330078125, 1382.088134765625, 1650.3038330078125, 1382.088134765625, 1848.214111328125, 781.0881958007812, 1848.214111328125], "score": 0.9999865293502808}, {"category_id": 2, "poly": [1346.05322265625, 194.5203399658203, 1381.46875, 194.5203399658203, 1381.46875, 216.90557861328125, 1346.05322265625, 216.90557861328125], "score": 0.9999804496765137}, {"category_id": 8, "poly": [779.12451171875, 544.6279296875, 1165.58349609375, 544.6279296875, 1165.58349609375, 623.5341796875, 779.12451171875, 623.5341796875], "score": 0.9999717473983765}, {"category_id": 1, "poly": [781.6971435546875, 352.1080017089844, 1382.5953369140625, 352.1080017089844, 1382.5953369140625, 515.912109375, 781.6971435546875, 515.912109375], "score": 0.999969482421875}, {"category_id": 9, "poly": [1347.20849609375, 571.1251831054688, 1380.7503662109375, 571.1251831054688, 1380.7503662109375, 601.0969848632812, 1347.20849609375, 601.0969848632812], "score": 0.9999024868011475}, {"category_id": 7, "poly": [659.8250732421875, 882.5633544921875, 686.8219604492188, 882.5633544921875, 686.8219604492188, 1842.583251953125, 659.8250732421875, 1842.583251953125], "score": 0.9764553904533386}, {"category_id": 6, "poly": [112.29073333740234, 1497.288330078125, 169.8206329345703, 1497.288330078125, 169.8206329345703, 1843.9019775390625, 112.29073333740234, 1843.9019775390625], "score": 0.8885180950164795}, {"category_id": 14, "poly": [776, 546, 1164, 546, 1164, 622, 776, 622], "score": 0.91, "latex": "F=\\frac{[(\\mathrm{SSE_{s}-S S E_{c}})/(\\mathrm{df_{c}-d f_{s}})]}{\\mathrm{SSE_{c}/d f_{c}}}"}, {"category_id": 13, "poly": [1087, 415, 1135, 415, 1135, 447, 
1087, 447], "score": 0.88, "latex": "F^{0.5}"}, {"category_id": 13, "poly": [1155, 1183, 1223, 1183, 1223, 1214, 1155, 1214], "score": 0.86, "latex": "100\\%"}, {"category_id": 13, "poly": [779, 1781, 820, 1781, 820, 1812, 779, 1812], "score": 0.82, "latex": "6\\%"}, {"category_id": 13, "poly": [831, 487, 852, 487, 852, 513, 831, 513], "score": 0.77, "latex": "F"}, {"category_id": 13, "poly": [1120, 390, 1133, 390, 1133, 413, 1120, 413], "score": 0.72, "latex": "t\\cdot"}, {"category_id": 13, "poly": [780, 423, 792, 423, 792, 446, 780, 446], "score": 0.49, "latex": "t\\cdot"}, {"category_id": 13, "poly": [1074, 1716, 1095, 1716, 1095, 1742, 1074, 1742], "score": 0.31, "latex": "P"}, {"category_id": 15, "poly": [814.0, 952.0, 1383.0, 952.0, 1383.0, 983.0, 814.0, 983.0], "score": 0.98, "text": "Daily streamflow data were obtained from 10"}, {"category_id": 15, "poly": [782.0, 987.0, 1383.0, 987.0, 1383.0, 1017.0, 782.0, 1017.0], "score": 0.99, "text": "catchment studies from southeastern Australia, New"}, {"category_id": 15, "poly": [780.0, 1019.0, 1383.0, 1019.0, 1383.0, 1049.0, 780.0, 1049.0], "score": 0.99, "text": "Zealand and South Africa. The initial criteria for"}, {"category_id": 15, "poly": [775.0, 1047.0, 1383.0, 1051.0, 1383.0, 1088.0, 775.0, 1084.0], "score": 0.99, "text": "selection of these catchments were a known veg-"}, {"category_id": 15, "poly": [780.0, 1088.0, 1385.0, 1088.0, 1385.0, 1118.0, 780.0, 1118.0], "score": 0.95, "text": "etation history and streamflow records of good"}, {"category_id": 15, "poly": [780.0, 1120.0, 1381.0, 1120.0, 1381.0, 1150.0, 780.0, 1150.0], "score": 0.99, "text": "quality. 
The ideal data sets were those with a lengthy"}, {"category_id": 15, "poly": [777.0, 1155.0, 1385.0, 1152.0, 1385.0, 1182.0, 778.0, 1185.0], "score": 0.97, "text": "pre- and post-treatment (plantation establishment)"}, {"category_id": 15, "poly": [775.0, 1215.0, 1387.0, 1217.0, 1387.0, 1253.0, 775.0, 1251.0], "score": 0.99, "text": " ment converted from grassland or a crop equivalent to"}, {"category_id": 15, "poly": [780.0, 1253.0, 1385.0, 1253.0, 1385.0, 1284.0, 780.0, 1284.0], "score": 0.99, "text": "plantation. In reality, all these criteria were not easy to"}, {"category_id": 15, "poly": [782.0, 1286.0, 1383.0, 1286.0, 1383.0, 1316.0, 782.0, 1316.0], "score": 0.99, "text": "satisfy. For example in Victoria, Australia, the best"}, {"category_id": 15, "poly": [780.0, 1320.0, 1385.0, 1320.0, 1385.0, 1348.0, 780.0, 1348.0], "score": 0.99, "text": "data is from Stewarts Creek, a set of decommissioned"}, {"category_id": 15, "poly": [780.0, 1352.0, 1385.0, 1352.0, 1385.0, 1382.0, 780.0, 1382.0], "score": 0.99, "text": "research catchments with 9 years of pre-treatment"}, {"category_id": 15, "poly": [780.0, 1387.0, 1383.0, 1387.0, 1383.0, 1417.0, 780.0, 1417.0], "score": 1.0, "text": "data and 25 years of post-treatment. Here, though, the"}, {"category_id": 15, "poly": [778.0, 1417.0, 1383.0, 1419.0, 1383.0, 1449.0, 777.0, 1447.0], "score": 0.98, "text": "treatment was a conversion from native eucalypt"}, {"category_id": 15, "poly": [780.0, 1453.0, 1385.0, 1453.0, 1385.0, 1484.0, 780.0, 1484.0], "score": 0.99, "text": "forest to pine. The assumption made for this data set is"}, {"category_id": 15, "poly": [780.0, 1486.0, 1383.0, 1486.0, 1383.0, 1516.0, 780.0, 1516.0], "score": 0.98, "text": "that the immediate post-treatment period may be"}, {"category_id": 15, "poly": [780.0, 1518.0, 1383.0, 1518.0, 1383.0, 1548.0, 780.0, 1548.0], "score": 0.99, "text": "viewed as a non-forested condition. 
This condition is"}, {"category_id": 15, "poly": [777.0, 1552.0, 1385.0, 1552.0, 1385.0, 1582.0, 777.0, 1582.0], "score": 0.99, "text": "likely to approximate the ET conditions of pasture or"}, {"category_id": 15, "poly": [780.0, 1587.0, 1385.0, 1587.0, 1385.0, 1617.0, 780.0, 1617.0], "score": 0.98, "text": "short crops for up to 3 years. Catchment details and"}, {"category_id": 15, "poly": [775.0, 1619.0, 1145.0, 1617.0, 1145.0, 1647.0, 775.0, 1649.0], "score": 1.0, "text": "treatments are given in Table 1."}, {"category_id": 15, "poly": [780.0, 1187.0, 1154.0, 1187.0, 1154.0, 1217.0, 780.0, 1217.0], "score": 0.98, "text": "flow record with approximately"}, {"category_id": 15, "poly": [1224.0, 1187.0, 1381.0, 1187.0, 1381.0, 1217.0, 1224.0, 1217.0], "score": 0.99, "text": "of the catch-"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [776.0, 881.0, 924.0, 886.0, 923.0, 925.0, 775.0, 920.0], "score": 0.96, "text": " 3. 
Data sets"}, {"category_id": 15, "poly": [782.0, 254.0, 1383.0, 254.0, 1383.0, 290.0, 782.0, 290.0], "score": 0.99, "text": "when comparing (5) and (2) would indicate the time"}, {"category_id": 15, "poly": [782.0, 290.0, 1381.0, 290.0, 1381.0, 320.0, 782.0, 320.0], "score": 1.0, "text": "term in (6) was required to improve the complete"}, {"category_id": 15, "poly": [778.0, 318.0, 1347.0, 320.0, 1347.0, 357.0, 777.0, 355.0], "score": 1.0, "text": "model and is therefore significant, and vice versa."}, {"category_id": 15, "poly": [782.0, 656.0, 1385.0, 656.0, 1385.0, 686.0, 782.0, 686.0], "score": 1.0, "text": "where SSE is the residual sum of the squared errors, df"}, {"category_id": 15, "poly": [780.0, 686.0, 1383.0, 686.0, 1383.0, 722.0, 780.0, 722.0], "score": 0.99, "text": "is degrees of freedom, and the subscripts s and c refer"}, {"category_id": 15, "poly": [778.0, 720.0, 1379.0, 722.0, 1379.0, 753.0, 777.0, 750.0], "score": 0.98, "text": "to the simplified model and complete models,"}, {"category_id": 15, "poly": [777.0, 752.0, 926.0, 752.0, 926.0, 789.0, 777.0, 789.0], "score": 0.97, "text": "respectively."}, {"category_id": 15, "poly": [816.0, 1651.0, 1381.0, 1651.0, 1381.0, 1681.0, 816.0, 1681.0], "score": 0.99, "text": "All catchments, with the exception of Traralgon"}, {"category_id": 15, "poly": [782.0, 1686.0, 1379.0, 1686.0, 1379.0, 1716.0, 782.0, 1716.0], "score": 1.0, "text": "Creek, were afforested with pine species, predomi-"}, {"category_id": 15, "poly": [782.0, 1752.0, 1379.0, 1752.0, 1379.0, 1782.0, 782.0, 1782.0], "score": 1.0, "text": "Cathedral Peak catchments. 
Traralgon Creek has only"}, {"category_id": 15, "poly": [776.0, 1812.0, 1147.0, 1817.0, 1147.0, 1851.0, 775.0, 1847.0], "score": 0.99, "text": "of which is Eucalyptus regnans."}, {"category_id": 15, "poly": [821.0, 1784.0, 1381.0, 1784.0, 1381.0, 1815.0, 821.0, 1815.0], "score": 0.99, "text": " pine, with the remainder eucalypts species, most"}, {"category_id": 15, "poly": [780.0, 1718.0, 1073.0, 1718.0, 1073.0, 1748.0, 780.0, 1748.0], "score": 0.97, "text": "nantly Pinus radiata, with"}, {"category_id": 15, "poly": [1096.0, 1718.0, 1381.0, 1718.0, 1381.0, 1748.0, 1096.0, 1748.0], "score": 0.96, "text": "patula planted at the two"}, {"category_id": 15, "poly": [1340.0, 189.0, 1387.0, 189.0, 1387.0, 239.0, 1340.0, 239.0], "score": 1.0, "text": "257"}, {"category_id": 15, "poly": [814.0, 355.0, 1383.0, 355.0, 1383.0, 385.0, 814.0, 385.0], "score": 0.98, "text": "Due to the constraint that the rainfall and time term"}, {"category_id": 15, "poly": [780.0, 456.0, 1381.0, 456.0, 1381.0, 486.0, 780.0, 486.0], "score": 0.97, "text": "the critical value for significance at the 0.05 level."}, {"category_id": 15, "poly": [1136.0, 415.0, 1383.0, 417.0, 1383.0, 454.0, 1136.0, 451.0], "score": 0.98, "text": ", and compared with"}, {"category_id": 15, "poly": [780.0, 486.0, 830.0, 488.0, 830.0, 518.0, 780.0, 516.0], "score": 1.0, "text": "The"}, {"category_id": 15, "poly": [853.0, 486.0, 1160.0, 488.0, 1160.0, 518.0, 853.0, 516.0], "score": 0.99, "text": "-statistic was calculated as:"}, {"category_id": 15, "poly": [782.0, 389.0, 1119.0, 389.0, 1119.0, 419.0, 782.0, 419.0], "score": 0.99, "text": "must be positive, a one tailed"}, {"category_id": 15, "poly": [1134.0, 389.0, 1381.0, 389.0, 1381.0, 419.0, 1134.0, 419.0], "score": 0.98, "text": "-test was applied. 
The"}, {"category_id": 15, "poly": [793.0, 415.0, 1086.0, 417.0, 1086.0, 454.0, 793.0, 451.0], "score": 0.96, "text": "-value was calculated as"}], "page_info": {"page_no": 4, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 1, "poly": [795.8170166015625, 1284.819091796875, 1393.7825927734375, 1284.819091796875, 1393.7825927734375, 1348.0101318359375, 795.8170166015625, 1348.0101318359375], "score": 0.999998927116394}, {"category_id": 0, "poly": [796.8157348632812, 1217.8375244140625, 1043.37890625, 1217.8375244140625, 1043.37890625, 1247.9854736328125, 796.8157348632812, 1247.9854736328125], "score": 0.9999985694885254}, {"category_id": 2, "poly": [129.85670471191406, 196.5391082763672, 165.74688720703125, 196.5391082763672, 165.74688720703125, 216.1761932373047, 129.85670471191406, 216.1761932373047], "score": 0.9999985098838806}, {"category_id": 6, "poly": [129.43519592285156, 1376.390380859375, 374.6851806640625, 1376.390380859375, 374.6851806640625, 1428.6553955078125, 129.43519592285156, 1428.6553955078125], "score": 0.9999979734420776}, {"category_id": 0, "poly": [796.9744873046875, 1150.6444091796875, 911.8010864257812, 1150.6444091796875, 911.8010864257812, 1181.93017578125, 796.9744873046875, 1181.93017578125], "score": 0.9999944567680359}, {"category_id": 1, "poly": [793.93212890625, 785.8515625, 1394.5316162109375, 785.8515625, 1394.5316162109375, 1081.7857666015625, 793.93212890625, 1081.7857666015625], "score": 0.9999939203262329}, {"category_id": 5, "poly": [123.9719009399414, 1433.21337890625, 1400.553466796875, 1433.21337890625, 1400.553466796875, 1814.204345703125, 123.9719009399414, 1814.204345703125], "score": 0.9999938011169434}, {"category_id": 1, "poly": [130.5817108154297, 786.2269897460938, 730.632080078125, 786.2269897460938, 730.632080078125, 1349.01904296875, 130.5817108154297, 1349.01904296875], "score": 0.9999901652336121}, {"category_id": 6, "poly": [128.56288146972656, 253.8047637939453, 514.1275024414062, 
253.8047637939453, 514.1275024414062, 308.0131530761719, 128.56288146972656, 308.0131530761719], "score": 0.9999885559082031}, {"category_id": 5, "poly": [126.36900329589844, 314.1026611328125, 1399.912109375, 314.1026611328125, 1399.912109375, 690.7048950195312, 126.36900329589844, 690.7048950195312], "score": 0.9999723434448242}, {"category_id": 2, "poly": [479.1275939941406, 195.43199157714844, 1044.5283203125, 195.43199157714844, 1044.5283203125, 218.68853759765625, 479.1275939941406, 218.68853759765625], "score": 0.9999346733093262}, {"category_id": 7, "poly": [128.99925231933594, 698.5426635742188, 1394.448486328125, 698.5426635742188, 1394.448486328125, 749.8440551757812, 128.99925231933594, 749.8440551757812], "score": 0.9987799525260925}, {"category_id": 7, "poly": [127.37924194335938, 1819.0853271484375, 1038.7354736328125, 1819.0853271484375, 1038.7354736328125, 1844.94091796875, 127.37924194335938, 1844.94091796875], "score": 0.9987504482269287}, {"category_id": 13, "poly": [626, 696, 660, 696, 660, 720, 626, 720], "score": 0.86, "latex": "5\\%"}, {"category_id": 13, "poly": [1190, 697, 1224, 697, 1224, 720, 1190, 720], "score": 0.86, "latex": "5\\%"}, {"category_id": 13, "poly": [299, 724, 342, 724, 342, 748, 299, 748], "score": 0.85, "latex": "10\\%"}, {"category_id": 13, "poly": [128, 698, 146, 698, 146, 719, 128, 719], "score": 0.69, "latex": "P"}, {"category_id": 13, "poly": [719, 697, 737, 697, 737, 719, 719, 719], "score": 0.44, "latex": "T"}, {"category_id": 13, "poly": [356, 1404, 375, 1404, 375, 1426, 356, 1426], "score": 0.33, "latex": "E"}, {"category_id": 15, "poly": [827.0, 1286.0, 1396.0, 1286.0, 1396.0, 1316.0, 827.0, 1316.0], "score": 0.95, "text": "The fit of the complete model, Eq. (2), to the"}, {"category_id": 15, "poly": [795.0, 1322.0, 1394.0, 1322.0, 1394.0, 1352.0, 795.0, 1352.0], "score": 0.98, "text": "observed data was generally good. 
Table 2 gives"}, {"category_id": 15, "poly": [795.0, 1221.0, 1046.0, 1221.0, 1046.0, 1251.0, 795.0, 1251.0], "score": 1.0, "text": "4.1. Model evaluation"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "258"}, {"category_id": 15, "poly": [131.0, 1378.0, 204.0, 1378.0, 204.0, 1402.0, 131.0, 1402.0], "score": 0.93, "text": "Table 3"}, {"category_id": 15, "poly": [127.0, 1397.0, 355.0, 1400.0, 355.0, 1436.0, 127.0, 1434.0], "score": 0.97, "text": "Coefficient of efficiency,"}, {"category_id": 15, "poly": [790.0, 1150.0, 915.0, 1150.0, 915.0, 1189.0, 790.0, 1189.0], "score": 1.0, "text": "4. Results"}, {"category_id": 15, "poly": [793.0, 787.0, 1396.0, 787.0, 1396.0, 817.0, 793.0, 817.0], "score": 0.99, "text": "and Redhill the lower BFI is matched by the shallow"}, {"category_id": 15, "poly": [793.0, 821.0, 1398.0, 821.0, 1398.0, 849.0, 793.0, 849.0], "score": 0.99, "text": "soils. Pre-treatment data is not available for all"}, {"category_id": 15, "poly": [793.0, 854.0, 1398.0, 854.0, 1398.0, 884.0, 793.0, 884.0], "score": 0.98, "text": "catchment in the data set, so it was decided for the"}, {"category_id": 15, "poly": [793.0, 886.0, 1396.0, 886.0, 1396.0, 916.0, 793.0, 916.0], "score": 1.0, "text": "sake of consistency in the analysis to start each of the"}, {"category_id": 15, "poly": [795.0, 920.0, 1396.0, 920.0, 1396.0, 950.0, 795.0, 950.0], "score": 0.98, "text": "data sets in the year of treatment. The FDCs were"}, {"category_id": 15, "poly": [795.0, 955.0, 1396.0, 955.0, 1396.0, 985.0, 795.0, 985.0], "score": 0.99, "text": "constructed for water years of May-April for eight"}, {"category_id": 15, "poly": [793.0, 987.0, 1396.0, 987.0, 1396.0, 1017.0, 793.0, 1017.0], "score": 1.0, "text": "catchments. 
The 2 Cathedral Peak catchments were"}, {"category_id": 15, "poly": [790.0, 1019.0, 1398.0, 1019.0, 1398.0, 1049.0, 790.0, 1049.0], "score": 0.98, "text": "analysed for November-October because of the"}, {"category_id": 15, "poly": [790.0, 1054.0, 1192.0, 1054.0, 1192.0, 1084.0, 790.0, 1084.0], "score": 1.0, "text": "summer rainfall maxima (Table 2)."}, {"category_id": 15, "poly": [163.0, 785.0, 732.0, 785.0, 732.0, 815.0, 163.0, 815.0], "score": 0.98, "text": "Data on soil characteristics have been obtained"}, {"category_id": 15, "poly": [125.0, 815.0, 735.0, 817.0, 734.0, 854.0, 125.0, 851.0], "score": 1.0, "text": "from published reports and personal communication"}, {"category_id": 15, "poly": [125.0, 849.0, 732.0, 851.0, 732.0, 888.0, 125.0, 886.0], "score": 1.0, "text": "with researchers, but is far from uniform, particularly"}, {"category_id": 15, "poly": [124.0, 886.0, 734.0, 881.0, 735.0, 918.0, 125.0, 922.0], "score": 1.0, "text": "regarding porosity. Consequently only an indication"}, {"category_id": 15, "poly": [129.0, 920.0, 732.0, 920.0, 732.0, 950.0, 129.0, 950.0], "score": 0.99, "text": "of mean depth is reported here. However, this does"}, {"category_id": 15, "poly": [125.0, 950.0, 732.0, 952.0, 732.0, 989.0, 125.0, 987.0], "score": 0.99, "text": " give some indication of the likely relative storage"}, {"category_id": 15, "poly": [129.0, 987.0, 732.0, 987.0, 732.0, 1017.0, 129.0, 1017.0], "score": 0.99, "text": "capacities of the catchments. 
To obtain insights into"}, {"category_id": 15, "poly": [129.0, 1021.0, 732.0, 1021.0, 732.0, 1051.0, 129.0, 1051.0], "score": 0.97, "text": "the pre-afforestation hydrologic characteristics a"}, {"category_id": 15, "poly": [129.0, 1054.0, 732.0, 1054.0, 732.0, 1084.0, 129.0, 1084.0], "score": 0.99, "text": "baseflow separation was performed on the daily"}, {"category_id": 15, "poly": [129.0, 1088.0, 732.0, 1088.0, 732.0, 1118.0, 129.0, 1118.0], "score": 0.97, "text": "fows for the first 3 years following disturbance,"}, {"category_id": 15, "poly": [129.0, 1118.0, 730.0, 1118.0, 730.0, 1148.0, 129.0, 1148.0], "score": 0.98, "text": "using the digital filtering method of Lyne and Hollick"}, {"category_id": 15, "poly": [129.0, 1152.0, 732.0, 1152.0, 732.0, 1182.0, 129.0, 1182.0], "score": 0.98, "text": "(1979) with a filter coefficient of 0.925 and three"}, {"category_id": 15, "poly": [125.0, 1185.0, 734.0, 1182.0, 735.0, 1219.0, 125.0, 1221.0], "score": 0.99, "text": " passes. The resultant average basefow index (BFI),"}, {"category_id": 15, "poly": [129.0, 1221.0, 730.0, 1221.0, 730.0, 1249.0, 129.0, 1249.0], "score": 0.98, "text": "the ratio of baseflow to total flow, is given in Table 1."}, {"category_id": 15, "poly": [129.0, 1251.0, 730.0, 1251.0, 730.0, 1281.0, 129.0, 1281.0], "score": 0.97, "text": "The Australian catchments display a notably"}, {"category_id": 15, "poly": [129.0, 1288.0, 732.0, 1288.0, 732.0, 1316.0, 129.0, 1316.0], "score": 0.98, "text": "lower BFI than the South African and New Zealand"}, {"category_id": 15, "poly": [127.0, 1320.0, 734.0, 1318.0, 735.0, 1348.0, 127.0, 1350.0], "score": 0.96, "text": "catchments. 
For Stewarts Creek, Pine Creek"}, {"category_id": 15, "poly": [129.0, 252.0, 208.0, 252.0, 208.0, 282.0, 129.0, 282.0], "score": 0.98, "text": "Table 2"}, {"category_id": 15, "poly": [129.0, 282.0, 513.0, 282.0, 513.0, 312.0, 129.0, 312.0], "score": 0.99, "text": "Significance of the rainfall and time terms"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [1225.0, 697.0, 1396.0, 697.0, 1396.0, 727.0, 1225.0, 727.0], "score": 0.98, "text": "level, * represents"}, {"category_id": 15, "poly": [129.0, 725.0, 298.0, 725.0, 298.0, 755.0, 129.0, 755.0], "score": 0.98, "text": "significance at the"}, {"category_id": 15, "poly": [343.0, 725.0, 941.0, 725.0, 941.0, 755.0, 343.0, 755.0], "score": 0.99, "text": "level, and na denotes too few data points for meaningful analysis."}, {"category_id": 15, "poly": [147.0, 697.0, 625.0, 697.0, 625.0, 727.0, 147.0, 727.0], "score": 0.98, "text": " indicates that the rainfall term was significant at the"}, {"category_id": 15, "poly": [661.0, 697.0, 718.0, 697.0, 718.0, 727.0, 661.0, 727.0], "score": 1.0, "text": "level,"}, {"category_id": 15, "poly": [738.0, 697.0, 1189.0, 697.0, 1189.0, 727.0, 738.0, 727.0], "score": 0.99, "text": "indicates that the time term was significant at the"}, {"category_id": 15, "poly": [129.0, 1821.0, 1037.0, 1821.0, 1037.0, 1849.0, 129.0, 1849.0], "score": 0.99, "text": "ns Indicates that no solution was found, and na denotes deciles with too few data points for analysis"}], "page_info": {"page_no": 5, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [1345.084228515625, 193.99124145507812, 1383.04443359375, 193.99124145507812, 1383.04443359375, 217.28871154785156, 1345.084228515625, 217.28871154785156], "score": 0.9999984502792358}, {"category_id": 1, "poly": [778.7415161132812, 875.8572387695312, 1385.70263671875, 
875.8572387695312, 1385.70263671875, 1045.03857421875, 778.7415161132812, 1045.03857421875], "score": 0.9999930262565613}, {"category_id": 1, "poly": [112.97018432617188, 850.864990234375, 721.0302124023438, 850.864990234375, 721.0302124023438, 1216.21875, 112.97018432617188, 1216.21875], "score": 0.9999922513961792}, {"category_id": 4, "poly": [777.7315673828125, 753.8668212890625, 1386.6640625, 753.8668212890625, 1386.6640625, 842.7579345703125, 777.7315673828125, 842.7579345703125], "score": 0.9999915957450867}, {"category_id": 1, "poly": [777.9397583007812, 1045.28857421875, 1386.9669189453125, 1045.28857421875, 1386.9669189453125, 1678.6064453125, 777.9397583007812, 1678.6064453125], "score": 0.9999915957450867}, {"category_id": 1, "poly": [112.64908599853516, 250.50961303710938, 720.302001953125, 250.50961303710938, 720.302001953125, 849.3114624023438, 112.64908599853516, 849.3114624023438], "score": 0.9999906420707703}, {"category_id": 1, "poly": [112.41944122314453, 1315.5491943359375, 721.3580932617188, 1315.5491943359375, 721.3580932617188, 1851.324462890625, 112.41944122314453, 1851.324462890625], "score": 0.9999880790710449}, {"category_id": 3, "poly": [776.4273681640625, 253.75418090820312, 1388.254638671875, 253.75418090820312, 1388.254638671875, 736.9627685546875, 776.4273681640625, 736.9627685546875], "score": 0.9999828338623047}, {"category_id": 2, "poly": [464.4588928222656, 193.45211791992188, 1032.725341796875, 193.45211791992188, 1032.725341796875, 219.19715881347656, 464.4588928222656, 219.19715881347656], "score": 0.9999587535858154}, {"category_id": 0, "poly": [115.3223876953125, 1251.6119384765625, 695.34326171875, 1251.6119384765625, 695.34326171875, 1287.6334228515625, 115.3223876953125, 1287.6334228515625], "score": 0.9989659786224365}, {"category_id": 1, "poly": [778.8644409179688, 1705.2630615234375, 1386.922119140625, 1705.2630615234375, 1386.922119140625, 1843.95654296875, 778.8644409179688, 1843.95654296875], "score": 
0.99659264087677}, {"category_id": 13, "poly": [601, 1814, 711, 1814, 711, 1847, 601, 1847], "score": 0.9, "latex": "T{=}\\,2T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [878, 1079, 975, 1079, 975, 1110, 878, 1110], "score": 0.9, "latex": "Y/(Y+a)"}, {"category_id": 13, "poly": [780, 880, 833, 880, 833, 911, 780, 911], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [296, 319, 380, 319, 380, 349, 296, 349], "score": 0.89, "latex": "E\\!>\\!0.7"}, {"category_id": 13, "poly": [160, 1682, 231, 1682, 231, 1713, 160, 1713], "score": 0.88, "latex": "a+Y)"}, {"category_id": 13, "poly": [116, 320, 188, 320, 188, 351, 116, 351], "score": 0.88, "latex": "(77\\%)"}, {"category_id": 13, "poly": [268, 751, 324, 751, 324, 781, 268, 781], "score": 0.87, "latex": "80\\%"}, {"category_id": 13, "poly": [628, 585, 684, 585, 684, 615, 628, 615], "score": 0.87, "latex": "75\\%"}, {"category_id": 13, "poly": [602, 619, 644, 619, 644, 647, 602, 647], "score": 0.85, "latex": "9\\%"}, {"category_id": 13, "poly": [533, 784, 577, 784, 577, 814, 533, 814], "score": 0.83, "latex": "9\\%"}, {"category_id": 13, "poly": [323, 1384, 364, 1384, 364, 1412, 323, 1412], "score": 0.77, "latex": "\\Delta P"}, {"category_id": 13, "poly": [286, 852, 308, 852, 308, 879, 286, 879], "score": 0.75, "latex": "E"}, {"category_id": 13, "poly": [409, 885, 432, 885, 432, 912, 409, 912], "score": 0.71, "latex": "E"}, {"category_id": 13, "poly": [566, 1085, 590, 1085, 590, 1112, 566, 1112], "score": 0.7, "latex": "E"}, {"category_id": 13, "poly": [484, 254, 524, 254, 524, 284, 484, 284], "score": 0.7, "latex": "(E)"}, {"category_id": 13, "poly": [315, 919, 334, 919, 334, 946, 315, 946], "score": 0.66, "latex": "b"}, {"category_id": 13, "poly": [376, 587, 394, 587, 394, 614, 376, 614], "score": 0.62, "latex": "b"}, {"category_id": 13, "poly": [460, 1051, 478, 1051, 478, 1077, 460, 1077], "score": 0.59, "latex": "b"}, {"category_id": 13, "poly": [451, 319, 552, 319, 552, 350, 451, 350], 
"score": 0.46, "latex": "60\\%~0.8"}, {"category_id": 13, "poly": [498, 719, 522, 719, 522, 746, 498, 746], "score": 0.45, "latex": "Y"}, {"category_id": 15, "poly": [1342.0, 191.0, 1387.0, 191.0, 1387.0, 236.0, 1342.0, 236.0], "score": 1.0, "text": "259"}, {"category_id": 15, "poly": [780.0, 914.0, 1383.0, 914.0, 1383.0, 944.0, 780.0, 944.0], "score": 0.99, "text": "most deciles the adjusted FDCs are identical for 12"}, {"category_id": 15, "poly": [780.0, 948.0, 1381.0, 948.0, 1381.0, 978.0, 780.0, 978.0], "score": 0.98, "text": "and 20 years after treatment. This figure clearly"}, {"category_id": 15, "poly": [782.0, 983.0, 1381.0, 983.0, 1381.0, 1013.0, 782.0, 1013.0], "score": 0.98, "text": "demonstrates the necessity for FDC adjustment,"}, {"category_id": 15, "poly": [773.0, 1013.0, 1168.0, 1010.0, 1168.0, 1047.0, 773.0, 1049.0], "score": 0.99, "text": " particularly for the 20 years FDC."}, {"category_id": 15, "poly": [834.0, 882.0, 1383.0, 882.0, 1383.0, 912.0, 834.0, 912.0], "score": 0.99, "text": "values are given in Table 4. Fig. 3 shows that for"}, {"category_id": 15, "poly": [116.0, 955.0, 719.0, 955.0, 719.0, 985.0, 116.0, 985.0], "score": 0.98, "text": "general the model fits the higher flows (lower deciles)"}, {"category_id": 15, "poly": [116.0, 987.0, 719.0, 987.0, 719.0, 1017.0, 116.0, 1017.0], "score": 1.0, "text": "better, most of the poorer fits are in the 80-100"}, {"category_id": 15, "poly": [112.0, 1017.0, 722.0, 1015.0, 722.0, 1051.0, 112.0, 1054.0], "score": 0.97, "text": " percentile range. 
This can be expected given the results"}, {"category_id": 15, "poly": [116.0, 1120.0, 722.0, 1120.0, 722.0, 1150.0, 116.0, 1150.0], "score": 0.99, "text": "Glendhu 2 and for 10th and 20th percentiles from"}, {"category_id": 15, "poly": [114.0, 1150.0, 724.0, 1150.0, 724.0, 1187.0, 114.0, 1187.0], "score": 0.98, "text": "Cathedral Peak 3 may exaggerate the goodness of fit to"}, {"category_id": 15, "poly": [114.0, 1187.0, 646.0, 1187.0, 646.0, 1215.0, 114.0, 1215.0], "score": 0.98, "text": "the exact form of the model (Lane et al., 2003)."}, {"category_id": 15, "poly": [150.0, 854.0, 285.0, 854.0, 285.0, 884.0, 150.0, 884.0], "score": 1.0, "text": "The poorest"}, {"category_id": 15, "poly": [309.0, 854.0, 715.0, 854.0, 715.0, 884.0, 309.0, 884.0], "score": 0.99, "text": "values were those from Lambrechts-"}, {"category_id": 15, "poly": [116.0, 886.0, 408.0, 886.0, 408.0, 916.0, 116.0, 916.0], "score": 0.95, "text": "bos A and B. The high"}, {"category_id": 15, "poly": [433.0, 886.0, 719.0, 886.0, 719.0, 916.0, 433.0, 916.0], "score": 0.96, "text": "for 50-100th deciles at"}, {"category_id": 15, "poly": [114.0, 1088.0, 565.0, 1088.0, 565.0, 1118.0, 114.0, 1118.0], "score": 0.97, "text": "sensitivity analysis suggested that the"}, {"category_id": 15, "poly": [591.0, 1088.0, 722.0, 1088.0, 722.0, 1118.0, 591.0, 1118.0], "score": 0.97, "text": "values for"}, {"category_id": 15, "poly": [116.0, 920.0, 314.0, 920.0, 314.0, 950.0, 116.0, 950.0], "score": 1.0, "text": "Biesievlei, where"}, {"category_id": 15, "poly": [335.0, 920.0, 719.0, 920.0, 719.0, 950.0, 335.0, 950.0], "score": 0.99, "text": "was not significant are notable. In"}, {"category_id": 15, "poly": [114.0, 1054.0, 459.0, 1054.0, 459.0, 1084.0, 114.0, 1084.0], "score": 0.95, "text": "of the significance tests for "}, {"category_id": 15, "poly": [479.0, 1054.0, 719.0, 1054.0, 719.0, 1084.0, 479.0, 1084.0], "score": 0.96, "text": ". 
The results of the"}, {"category_id": 15, "poly": [780.0, 759.0, 1383.0, 759.0, 1383.0, 789.0, 780.0, 789.0], "score": 0.98, "text": "Fig. 3. Examples of observed and fow duration curves adjusted for"}, {"category_id": 15, "poly": [782.0, 787.0, 1381.0, 787.0, 1381.0, 815.0, 782.0, 815.0], "score": 0.97, "text": "average rainfall following afforestation for Stewarts Creek 5,"}, {"category_id": 15, "poly": [779.0, 810.0, 873.0, 815.0, 871.0, 845.0, 777.0, 840.0], "score": 1.0, "text": "Australia."}, {"category_id": 15, "poly": [810.0, 1043.0, 1385.0, 1045.0, 1385.0, 1081.0, 810.0, 1079.0], "score": 0.98, "text": "The relative net flow change due to afforestation is"}, {"category_id": 15, "poly": [775.0, 1109.0, 1387.0, 1112.0, 1387.0, 1148.0, 775.0, 1146.0], "score": 0.99, "text": " old equilibrium water use condition of pre-treatment"}, {"category_id": 15, "poly": [777.0, 1146.0, 1387.0, 1142.0, 1388.0, 1178.0, 778.0, 1183.0], "score": 1.0, "text": "vegetation to the new equilibrium condition at forest"}, {"category_id": 15, "poly": [777.0, 1178.0, 1385.0, 1176.0, 1385.0, 1213.0, 778.0, 1215.0], "score": 0.99, "text": "canopy closure. This quantity is plotted for all catchments"}, {"category_id": 15, "poly": [780.0, 1215.0, 1385.0, 1215.0, 1385.0, 1245.0, 780.0, 1245.0], "score": 0.98, "text": "in Fig. 4. Some deciles have been removed from the data"}, {"category_id": 15, "poly": [782.0, 1247.0, 1383.0, 1247.0, 1383.0, 1277.0, 782.0, 1277.0], "score": 0.98, "text": "set, the 10th and 50th percentile for Glendhu 2 and the"}, {"category_id": 15, "poly": [782.0, 1281.0, 1383.0, 1281.0, 1383.0, 1312.0, 782.0, 1312.0], "score": 1.0, "text": "10th and 20th percentiles from Cathedral Peak 3. 
The"}, {"category_id": 15, "poly": [780.0, 1314.0, 1383.0, 1314.0, 1383.0, 1344.0, 780.0, 1344.0], "score": 0.98, "text": "optimised value of a was zero or near zero for these cases,"}, {"category_id": 15, "poly": [780.0, 1348.0, 1383.0, 1348.0, 1383.0, 1378.0, 780.0, 1378.0], "score": 1.0, "text": "which is not consistent with the conceptual model. The"}, {"category_id": 15, "poly": [775.0, 1378.0, 1385.0, 1376.0, 1385.0, 1413.0, 775.0, 1415.0], "score": 0.98, "text": " changes shown in Fig. 4 are variable. However, there are"}, {"category_id": 15, "poly": [780.0, 1415.0, 1385.0, 1415.0, 1385.0, 1445.0, 780.0, 1445.0], "score": 1.0, "text": "some commonalities between catchment responses. Two"}, {"category_id": 15, "poly": [775.0, 1447.0, 1387.0, 1445.0, 1387.0, 1481.0, 775.0, 1484.0], "score": 0.99, "text": "types of responses (groups) were identified. Group 1"}, {"category_id": 15, "poly": [780.0, 1481.0, 1385.0, 1481.0, 1385.0, 1511.0, 780.0, 1511.0], "score": 1.0, "text": "catchments show a substantial increase in the number of"}, {"category_id": 15, "poly": [777.0, 1514.0, 1387.0, 1514.0, 1387.0, 1550.0, 777.0, 1550.0], "score": 0.96, "text": " zero flow days, with a greater proportional reduction in"}, {"category_id": 15, "poly": [782.0, 1548.0, 1385.0, 1548.0, 1385.0, 1578.0, 782.0, 1578.0], "score": 0.99, "text": "low flows than high fows. Group 2 catchments show a"}, {"category_id": 15, "poly": [780.0, 1582.0, 1383.0, 1582.0, 1383.0, 1612.0, 780.0, 1612.0], "score": 0.99, "text": "more uniform proportional reduction in fows across all"}, {"category_id": 15, "poly": [777.0, 1617.0, 1383.0, 1615.0, 1383.0, 1645.0, 778.0, 1647.0], "score": 1.0, "text": "percentiles, albeit with some variability. 
The catchments"}, {"category_id": 15, "poly": [776.0, 1644.0, 980.0, 1649.0, 979.0, 1686.0, 775.0, 1681.0], "score": 0.95, "text": " in each group are:"}, {"category_id": 15, "poly": [780.0, 1079.0, 877.0, 1079.0, 877.0, 1116.0, 780.0, 1116.0], "score": 1.0, "text": "givenby"}, {"category_id": 15, "poly": [976.0, 1079.0, 1385.0, 1079.0, 1385.0, 1116.0, 976.0, 1116.0], "score": 0.98, "text": ", which represents the change from the"}, {"category_id": 15, "poly": [116.0, 288.0, 717.0, 288.0, 717.0, 318.0, 116.0, 318.0], "score": 0.99, "text": "percentile at all the catchments. The majority of fits"}, {"category_id": 15, "poly": [116.0, 355.0, 719.0, 355.0, 719.0, 385.0, 116.0, 385.0], "score": 0.99, "text": "significance of the rainfall and time terms is given in"}, {"category_id": 15, "poly": [114.0, 387.0, 717.0, 387.0, 717.0, 417.0, 114.0, 417.0], "score": 1.0, "text": "Table 3 for all deciles, where solutions were found."}, {"category_id": 15, "poly": [112.0, 417.0, 720.0, 421.0, 719.0, 452.0, 112.0, 447.0], "score": 0.98, "text": "There were not enough data to fit the model in five"}, {"category_id": 15, "poly": [116.0, 456.0, 717.0, 456.0, 717.0, 484.0, 116.0, 484.0], "score": 0.98, "text": "instances because of extended periods of zero flows."}, {"category_id": 15, "poly": [116.0, 488.0, 719.0, 488.0, 719.0, 518.0, 116.0, 518.0], "score": 0.99, "text": "This problem is addressed to some extent in the zero"}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 550.0, 116.0, 550.0], "score": 1.0, "text": "fow analysis. If the rainfall signal is to be separated"}, {"category_id": 15, "poly": [114.0, 555.0, 719.0, 555.0, 719.0, 585.0, 114.0, 585.0], "score": 0.98, "text": "from the vegetation signal the rainfall terms must be"}, {"category_id": 15, "poly": [110.0, 649.0, 722.0, 651.0, 722.0, 688.0, 109.0, 686.0], "score": 0.99, "text": " 0.10 level. 
The incidence of significance was greatest"}, {"category_id": 15, "poly": [116.0, 688.0, 719.0, 688.0, 719.0, 718.0, 116.0, 718.0], "score": 0.99, "text": "for the 10-50th percentiles at 45 of the 50 data sets at"}, {"category_id": 15, "poly": [112.0, 817.0, 432.0, 815.0, 432.0, 851.0, 112.0, 854.0], "score": 0.98, "text": "significant at the 0.10 level."}, {"category_id": 15, "poly": [189.0, 320.0, 295.0, 320.0, 295.0, 350.0, 189.0, 350.0], "score": 1.0, "text": "returned"}, {"category_id": 15, "poly": [116.0, 755.0, 267.0, 755.0, 267.0, 785.0, 116.0, 785.0], "score": 1.0, "text": "results, with"}, {"category_id": 15, "poly": [325.0, 755.0, 719.0, 755.0, 719.0, 785.0, 325.0, 785.0], "score": 0.97, "text": " of the deciles significant at 0.05"}, {"category_id": 15, "poly": [685.0, 589.0, 722.0, 589.0, 722.0, 619.0, 685.0, 619.0], "score": 1.0, "text": "of"}, {"category_id": 15, "poly": [114.0, 621.0, 601.0, 621.0, 601.0, 649.0, 114.0, 649.0], "score": 0.98, "text": "the deciles at the 0.05 level, and a further"}, {"category_id": 15, "poly": [645.0, 621.0, 719.0, 621.0, 719.0, 649.0, 645.0, 649.0], "score": 0.99, "text": "at the"}, {"category_id": 15, "poly": [116.0, 787.0, 532.0, 787.0, 532.0, 815.0, 116.0, 815.0], "score": 0.96, "text": "level. There were an additional"}, {"category_id": 15, "poly": [578.0, 787.0, 719.0, 787.0, 719.0, 815.0, 578.0, 815.0], "score": 0.99, "text": "of deciles"}, {"category_id": 15, "poly": [110.0, 249.0, 483.0, 252.0, 483.0, 288.0, 109.0, 286.0], "score": 0.91, "text": "the coefficient of efficiency "}, {"category_id": 15, "poly": [525.0, 249.0, 720.0, 252.0, 719.0, 288.0, 525.0, 286.0], "score": 0.96, "text": "for each flow"}, {"category_id": 15, "poly": [116.0, 589.0, 375.0, 589.0, 375.0, 619.0, 116.0, 619.0], "score": 1.0, "text": "significant. 
This term,"}, {"category_id": 15, "poly": [395.0, 589.0, 627.0, 589.0, 627.0, 619.0, 395.0, 619.0], "score": 0.98, "text": ", was significant for"}, {"category_id": 15, "poly": [381.0, 320.0, 450.0, 320.0, 450.0, 350.0, 381.0, 350.0], "score": 0.9, "text": "\uff0cwith"}, {"category_id": 15, "poly": [553.0, 320.0, 719.0, 320.0, 719.0, 350.0, 553.0, 350.0], "score": 0.97, "text": "or better. The"}, {"category_id": 15, "poly": [116.0, 718.0, 497.0, 718.0, 497.0, 748.0, 116.0, 748.0], "score": 0.99, "text": "the 0.05 level. The time term,"}, {"category_id": 15, "poly": [523.0, 718.0, 717.0, 718.0, 717.0, 748.0, 523.0, 748.0], "score": 0.97, "text": "returned similar"}, {"category_id": 15, "poly": [148.0, 1318.0, 719.0, 1318.0, 719.0, 1348.0, 148.0, 1348.0], "score": 0.95, "text": "Following the successful fitting of (2) to the"}, {"category_id": 15, "poly": [114.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 114.0, 1382.0], "score": 0.99, "text": "observed percentiles, the FDCs were adjusted for"}, {"category_id": 15, "poly": [114.0, 1419.0, 719.0, 1419.0, 719.0, 1447.0, 114.0, 1447.0], "score": 0.99, "text": "average annual rainfall. The climate adjusted FDCs"}, {"category_id": 15, "poly": [114.0, 1453.0, 719.0, 1453.0, 719.0, 1481.0, 114.0, 1481.0], "score": 0.96, "text": "produce an estimation of the change in flow"}, {"category_id": 15, "poly": [112.0, 1486.0, 722.0, 1483.0, 722.0, 1514.0, 112.0, 1516.0], "score": 0.98, "text": "percentiles over time for each catchment due to"}, {"category_id": 15, "poly": [116.0, 1518.0, 719.0, 1518.0, 719.0, 1548.0, 116.0, 1548.0], "score": 0.99, "text": "afforestation that may be viewed in two forms: new"}, {"category_id": 15, "poly": [114.0, 1552.0, 722.0, 1552.0, 722.0, 1580.0, 114.0, 1580.0], "score": 0.99, "text": "FDCs, adjusted for climate, as exemplified in Fig. 
3"}, {"category_id": 15, "poly": [112.0, 1587.0, 721.0, 1582.0, 722.0, 1612.0, 112.0, 1617.0], "score": 0.99, "text": "for Stewarts Creek 5, and a comparison between all"}, {"category_id": 15, "poly": [114.0, 1619.0, 717.0, 1619.0, 717.0, 1649.0, 114.0, 1649.0], "score": 0.99, "text": "catchments of the maximum change in yield (given by"}, {"category_id": 15, "poly": [118.0, 1651.0, 722.0, 1651.0, 722.0, 1681.0, 118.0, 1681.0], "score": 0.99, "text": "Y) for each flow percentile from baseline flows (given"}, {"category_id": 15, "poly": [118.0, 1718.0, 715.0, 1718.0, 715.0, 1748.0, 118.0, 1748.0], "score": 0.98, "text": "equilibrium of maximum water use is reached, the"}, {"category_id": 15, "poly": [116.0, 1750.0, 719.0, 1750.0, 719.0, 1780.0, 116.0, 1780.0], "score": 0.99, "text": "adjusted FDCs for individual years should be identical"}, {"category_id": 15, "poly": [114.0, 1784.0, 719.0, 1784.0, 719.0, 1815.0, 114.0, 1815.0], "score": 0.98, "text": "if rainfall variability has been accounted for. The new"}, {"category_id": 15, "poly": [110.0, 1808.0, 600.0, 1813.0, 600.0, 1856.0, 109.0, 1851.0], "score": 0.96, "text": " equilibrium is approximately reached for "}, {"category_id": 15, "poly": [116.0, 1686.0, 159.0, 1686.0, 159.0, 1716.0, 116.0, 1716.0], "score": 1.0, "text": "by"}, {"category_id": 15, "poly": [232.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 232.0, 1716.0], "score": 0.95, "text": " as shown in Fig. 4. Where the new"}, {"category_id": 15, "poly": [116.0, 1387.0, 322.0, 1387.0, 322.0, 1417.0, 116.0, 1417.0], "score": 1.0, "text": "climate by setting"}, {"category_id": 15, "poly": [365.0, 1387.0, 719.0, 1387.0, 719.0, 1417.0, 365.0, 1417.0], "score": 0.99, "text": "to zero, representing long term"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [112.0, 1249.0, 694.0, 1251.0, 694.0, 1288.0, 112.0, 1286.0], "score": 0.99, "text": "4.2. Adjusted FDCs\u2014magnitude of fow reductions"}, {"category_id": 15, "poly": [782.0, 1709.0, 1325.0, 1709.0, 1325.0, 1739.0, 782.0, 1739.0], "score": 0.99, "text": "Group 1: Stewarts Creek, Pine Creek, and Redhill"}, {"category_id": 15, "poly": [782.0, 1744.0, 1381.0, 1744.0, 1381.0, 1774.0, 782.0, 1774.0], "score": 0.99, "text": "Group 2: Cathedral Peak 2 and 3, Lambrechtsbos A,"}, {"category_id": 15, "poly": [889.0, 1776.0, 1383.0, 1776.0, 1383.0, 1806.0, 889.0, 1806.0], "score": 1.0, "text": "Lambrechtsbos B, Glendhu 2, Biesievlei and"}, {"category_id": 15, "poly": [889.0, 1810.0, 1072.0, 1810.0, 1072.0, 1840.0, 889.0, 1840.0], "score": 1.0, "text": "Traralgon Creek"}], "page_info": {"page_no": 6, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 6, "poly": [130.38211059570312, 1373.58056640625, 408.3111877441406, 1373.58056640625, 408.3111877441406, 1427.8253173828125, 130.38211059570312, 1427.8253173828125], "score": 0.9999985694885254}, {"category_id": 2, "poly": [131.90994262695312, 195.1804962158203, 165.77700805664062, 195.1804962158203, 165.77700805664062, 215.41661071777344, 131.90994262695312, 215.41661071777344], "score": 0.9999985098838806}, {"category_id": 2, "poly": [481.0845642089844, 195.8048095703125, 1043.9552001953125, 195.8048095703125, 1043.9552001953125, 218.32778930664062, 481.0845642089844, 218.32778930664062], "score": 0.9999977350234985}, {"category_id": 5, "poly": [124.61016845703125, 1434.242919921875, 1399.2454833984375, 1434.242919921875, 1399.2454833984375, 1811.951171875, 124.61016845703125, 1811.951171875], "score": 0.9999969005584717}, {"category_id": 4, "poly": [510.5360107421875, 734.0053100585938, 1013.9042358398438, 734.0053100585938, 1013.9042358398438, 758.7108154296875, 510.5360107421875, 758.7108154296875], "score": 0.9999968409538269}, 
{"category_id": 1, "poly": [131.32168579101562, 838.9521484375, 730.0957641601562, 838.9521484375, 730.0957641601562, 1314.5084228515625, 131.32168579101562, 1314.5084228515625], "score": 0.9999938011169434}, {"category_id": 3, "poly": [306.1774597167969, 253.64524841308594, 1219.746337890625, 253.64524841308594, 1219.746337890625, 705.4325561523438, 306.1774597167969, 705.4325561523438], "score": 0.9999911785125732}, {"category_id": 1, "poly": [794.51171875, 907.3822631835938, 1395.8782958984375, 907.3822631835938, 1395.8782958984375, 1313.7686767578125, 794.51171875, 1313.7686767578125], "score": 0.9999873042106628}, {"category_id": 7, "poly": [127.02899169921875, 1816.2164306640625, 940.5137939453125, 1816.2164306640625, 940.5137939453125, 1842.17822265625, 127.02899169921875, 1842.17822265625], "score": 0.999592661857605}, {"category_id": 0, "poly": [794.436767578125, 838.79541015625, 1117.543701171875, 838.79541015625, 1117.543701171875, 867.4995727539062, 794.436767578125, 867.4995727539062], "score": 0.9990140795707703}, {"category_id": 13, "poly": [759, 733, 840, 733, 840, 759, 759, 759], "score": 0.9, "latex": "Y/(Y+a)"}, {"category_id": 13, "poly": [815, 1077, 867, 1077, 867, 1108, 815, 1108], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [1088, 1179, 1140, 1179, 1140, 1211, 1088, 1211], "score": 0.89, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [130, 1247, 196, 1247, 196, 1277, 130, 1277], "score": 0.84, "latex": "100\\%"}, {"category_id": 13, "poly": [209, 1042, 276, 1042, 276, 1072, 209, 1072], "score": 0.84, "latex": "100\\%"}, {"category_id": 13, "poly": [1174, 940, 1224, 940, 1224, 971, 1174, 971], "score": 0.84, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [129, 1401, 172, 1401, 172, 1428, 129, 1428], "score": 0.7, "latex": "T_{\\mathrm{half}}"}, {"category_id": 15, "poly": [129.0, 1372.0, 208.0, 1372.0, 208.0, 1402.0, 129.0, 1402.0], "score": 0.93, "text": "Table 4"}, {"category_id": 15, 
"poly": [173.0, 1400.0, 408.0, 1397.0, 408.0, 1434.0, 173.0, 1436.0], "score": 1.0, "text": "(years) for all catchments"}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "260"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [509.0, 733.0, 758.0, 733.0, 758.0, 763.0, 509.0, 763.0], "score": 0.98, "text": "Fig. 4. Net flow reductions"}, {"category_id": 15, "poly": [841.0, 733.0, 1012.0, 733.0, 1012.0, 763.0, 841.0, 763.0], "score": 1.0, "text": "for all catchments."}, {"category_id": 15, "poly": [157.0, 836.0, 737.0, 834.0, 737.0, 871.0, 157.0, 873.0], "score": 0.99, "text": "Group 1 exhibit both the highest reduction of"}, {"category_id": 15, "poly": [127.0, 871.0, 732.0, 871.0, 732.0, 907.0, 127.0, 907.0], "score": 0.98, "text": "flows overall, and show the largest proportional"}, {"category_id": 15, "poly": [129.0, 909.0, 732.0, 909.0, 732.0, 940.0, 129.0, 940.0], "score": 0.96, "text": "reduction at lower flows, leading to a complete"}, {"category_id": 15, "poly": [129.0, 944.0, 732.0, 944.0, 732.0, 974.0, 129.0, 974.0], "score": 0.99, "text": "cessation of fow. Comparison of flow reductions is"}, {"category_id": 15, "poly": [125.0, 972.0, 735.0, 974.0, 734.0, 1011.0, 125.0, 1008.0], "score": 0.99, "text": "hindered slightly by the range of afforestation at the"}, {"category_id": 15, "poly": [129.0, 1010.0, 735.0, 1010.0, 735.0, 1041.0, 129.0, 1041.0], "score": 0.99, "text": "catchments (Table 1). These results could be scaled"}, {"category_id": 15, "poly": [129.0, 1079.0, 730.0, 1079.0, 730.0, 1109.0, 129.0, 1109.0], "score": 0.98, "text": "linear relationship between the area planted and flow"}, {"category_id": 15, "poly": [129.0, 1114.0, 732.0, 1114.0, 732.0, 1144.0, 129.0, 1144.0], "score": 0.98, "text": "reductions. 
As there is no evidence that this is the"}, {"category_id": 15, "poly": [129.0, 1146.0, 728.0, 1146.0, 728.0, 1176.0, 129.0, 1176.0], "score": 0.98, "text": "case we have not presented scaled reductions here."}, {"category_id": 15, "poly": [125.0, 1178.0, 737.0, 1176.0, 737.0, 1213.0, 125.0, 1215.0], "score": 0.95, "text": "Linear scaling would shift the reduction curves"}, {"category_id": 15, "poly": [125.0, 1213.0, 737.0, 1210.0, 737.0, 1247.0, 125.0, 1249.0], "score": 0.96, "text": "upward for those catchments that are less than"}, {"category_id": 15, "poly": [125.0, 1281.0, 492.0, 1286.0, 492.0, 1320.0, 124.0, 1316.0], "score": 0.98, "text": " of the curves or our groupings."}, {"category_id": 15, "poly": [197.0, 1245.0, 735.0, 1247.0, 734.0, 1284.0, 197.0, 1281.0], "score": 0.98, "text": " afforested, but would not change the shape"}, {"category_id": 15, "poly": [129.0, 1045.0, 208.0, 1045.0, 208.0, 1075.0, 129.0, 1075.0], "score": 0.99, "text": "upto"}, {"category_id": 15, "poly": [277.0, 1045.0, 735.0, 1045.0, 735.0, 1075.0, 277.0, 1075.0], "score": 0.98, "text": " afforested if it is assumed there is a"}, {"category_id": 15, "poly": [825.0, 903.0, 1398.0, 905.0, 1398.0, 942.0, 825.0, 940.0], "score": 0.99, "text": "The speed of fow responses to afforestation can be"}, {"category_id": 15, "poly": [793.0, 976.0, 1398.0, 976.0, 1398.0, 1006.0, 793.0, 1006.0], "score": 0.99, "text": "is substantial variation in response times both over the"}, {"category_id": 15, "poly": [788.0, 1008.0, 1402.0, 1004.0, 1403.0, 1041.0, 788.0, 1045.0], "score": 0.99, "text": " percentile spread in some individual catchments, and"}, {"category_id": 15, "poly": [790.0, 1045.0, 1398.0, 1045.0, 1398.0, 1075.0, 790.0, 1075.0], "score": 0.98, "text": " between the catchments. 
The majority of responses have"}, {"category_id": 15, "poly": [793.0, 1114.0, 1398.0, 1114.0, 1398.0, 1142.0, 793.0, 1142.0], "score": 0.99, "text": "Stewarts Creek, Redhill and Lambrechtsbos A exhibit the"}, {"category_id": 15, "poly": [793.0, 1148.0, 1398.0, 1148.0, 1398.0, 1178.0, 793.0, 1178.0], "score": 0.99, "text": "fastest responses, with Biesievlei showing the most"}, {"category_id": 15, "poly": [793.0, 1215.0, 1398.0, 1215.0, 1398.0, 1245.0, 793.0, 1245.0], "score": 0.99, "text": "catchments display a good correspondence to published"}, {"category_id": 15, "poly": [793.0, 1249.0, 1394.0, 1249.0, 1394.0, 1279.0, 793.0, 1279.0], "score": 0.99, "text": "annual changes (Scott et al., 2000; Van Wyk, 1987),"}, {"category_id": 15, "poly": [793.0, 1284.0, 1398.0, 1284.0, 1398.0, 1314.0, 793.0, 1314.0], "score": 0.99, "text": "excepting the 10-20th deciles for both Cathedral Peak"}, {"category_id": 15, "poly": [790.0, 1079.0, 814.0, 1079.0, 814.0, 1109.0, 790.0, 1109.0], "score": 0.96, "text": "a"}, {"category_id": 15, "poly": [868.0, 1079.0, 1398.0, 1079.0, 1398.0, 1109.0, 868.0, 1109.0], "score": 0.99, "text": "value between 5 and 10 years. Pine Creek and"}, {"category_id": 15, "poly": [788.0, 1178.0, 1087.0, 1176.0, 1087.0, 1213.0, 788.0, 1215.0], "score": 1.0, "text": "uniformly slow response."}, {"category_id": 15, "poly": [1141.0, 1178.0, 1400.0, 1176.0, 1400.0, 1213.0, 1141.0, 1215.0], "score": 0.94, "text": "for the South African"}, {"category_id": 15, "poly": [793.0, 942.0, 1173.0, 942.0, 1173.0, 972.0, 793.0, 972.0], "score": 1.0, "text": "evaluated by examining the value of"}, {"category_id": 15, "poly": [1225.0, 942.0, 1398.0, 942.0, 1398.0, 972.0, 1225.0, 972.0], "score": 0.97, "text": "(Table 4). 
There"}, {"category_id": 15, "poly": [129.0, 1815.0, 939.0, 1817.0, 939.0, 1847.0, 129.0, 1845.0], "score": 0.98, "text": "Note that no solution could be found for the 50 percentile for Glendhu indicted by the ns."}, {"category_id": 15, "poly": [793.0, 838.0, 1123.0, 838.0, 1123.0, 875.0, 793.0, 875.0], "score": 1.0, "text": "4.3. Timing of fow reductions"}], "page_info": {"page_no": 7, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [781.8721923828125, 1618.0399169921875, 928.7534790039062, 1618.0399169921875, 928.7534790039062, 1646.409912109375, 781.8721923828125, 1646.409912109375], "score": 0.9999984502792358}, {"category_id": 6, "poly": [113.6822738647461, 253.5222625732422, 1383.692626953125, 253.5222625732422, 1383.692626953125, 334.94287109375, 113.6822738647461, 334.94287109375], "score": 0.9999982714653015}, {"category_id": 2, "poly": [1345.8267822265625, 196.56283569335938, 1379.01416015625, 196.56283569335938, 1379.01416015625, 215.46395874023438, 1345.8267822265625, 215.46395874023438], "score": 0.999997615814209}, {"category_id": 2, "poly": [467.5944519042969, 195.73922729492188, 1030.1298828125, 195.73922729492188, 1030.1298828125, 218.0099639892578, 467.5944519042969, 218.0099639892578], "score": 0.9999966621398926}, {"category_id": 1, "poly": [117.23767852783203, 1243.1109619140625, 716.3004150390625, 1243.1109619140625, 716.3004150390625, 1847.4835205078125, 117.23767852783203, 1847.4835205078125], "score": 0.9999922513961792}, {"category_id": 1, "poly": [118.47014617919922, 994.3162841796875, 713.6297607421875, 994.3162841796875, 713.6297607421875, 1122.2752685546875, 118.47014617919922, 1122.2752685546875], "score": 0.9999918937683105}, {"category_id": 0, "poly": [119.10287475585938, 1178.6876220703125, 627.3995971679688, 1178.6876220703125, 627.3995971679688, 1206.4453125, 119.10287475585938, 1206.4453125], "score": 0.9999906420707703}, {"category_id": 1, "poly": [782.2766723632812, 1682.373291015625, 
1380.46728515625, 1682.373291015625, 1380.46728515625, 1845.99609375, 782.2766723632812, 1845.99609375], "score": 0.9999901652336121}, {"category_id": 1, "poly": [781.3034057617188, 1060.36083984375, 1379.3385009765625, 1060.36083984375, 1379.3385009765625, 1521.511962890625, 781.3034057617188, 1521.511962890625], "score": 0.9999886751174927}, {"category_id": 5, "poly": [113.5753402709961, 341.08380126953125, 1386.5994873046875, 341.08380126953125, 1386.5994873046875, 689.2748413085938, 113.5753402709961, 689.2748413085938], "score": 0.9999498128890991}, {"category_id": 7, "poly": [118.46651458740234, 696.7362670898438, 1380.3681640625, 696.7362670898438, 1380.3681640625, 861.0716552734375, 118.46651458740234, 861.0716552734375], "score": 0.9997767210006714}, {"category_id": 0, "poly": [782.720458984375, 993.1400146484375, 988.5659790039062, 993.1400146484375, 988.5659790039062, 1020.3793334960938, 782.720458984375, 1020.3793334960938], "score": 0.9996650218963623}, {"category_id": 13, "poly": [458, 778, 601, 778, 601, 806, 458, 806], "score": 0.91, "latex": "\\sum Y/\\sum(a+Y)"}, {"category_id": 13, "poly": [169, 1025, 221, 1025, 221, 1056, 169, 1056], "score": 0.91, "latex": "T_{\\mathrm{half}}"}, {"category_id": 13, "poly": [464, 750, 607, 750, 607, 778, 464, 778], "score": 0.88, "latex": "\\sum Y/\\sum(a+Y)"}, {"category_id": 13, "poly": [1201, 1191, 1277, 1191, 1277, 1221, 1201, 1221], "score": 0.88, "latex": "\\Delta N_{\\mathrm{zero}}"}, {"category_id": 13, "poly": [1296, 1323, 1350, 1323, 1350, 1353, 1296, 1353], "score": 0.86, "latex": "50\\%"}, {"category_id": 13, "poly": [1078, 1159, 1101, 1159, 1101, 1185, 1078, 1185], "score": 0.77, "latex": "E"}, {"category_id": 13, "poly": [1113, 1192, 1133, 1192, 1133, 1219, 1113, 1219], "score": 0.69, "latex": "b"}, {"category_id": 13, "poly": [375, 811, 390, 811, 390, 830, 375, 830], "score": 0.67, "latex": "a"}, {"category_id": 13, "poly": [990, 1196, 1003, 1196, 1003, 1218, 990, 1218], "score": 0.61, "latex": 
"t\\cdot"}, {"category_id": 13, "poly": [1066, 812, 1080, 812, 1080, 830, 1066, 830], "score": 0.58, "latex": "a"}, {"category_id": 13, "poly": [431, 808, 448, 808, 448, 830, 431, 830], "score": 0.46, "latex": "Y"}, {"category_id": 13, "poly": [1246, 1357, 1283, 1357, 1283, 1386, 1246, 1386], "score": 0.43, "latex": "\\mathrm{Ck}"}, {"category_id": 13, "poly": [773, 779, 827, 779, 827, 804, 773, 804], "score": 0.42, "latex": "100\\mathrm{th}"}, {"category_id": 13, "poly": [1107, 1357, 1144, 1357, 1144, 1386, 1107, 1386], "score": 0.41, "latex": "\\mathrm{Ck}"}, {"category_id": 13, "poly": [640, 807, 684, 807, 684, 831, 640, 831], "score": 0.29, "latex": "20\\mathrm{th}"}, {"category_id": 15, "poly": [776.0, 1612.0, 935.0, 1617.0, 933.0, 1656.0, 775.0, 1651.0], "score": 0.97, "text": " 5. Discussion"}, {"category_id": 15, "poly": [112.0, 252.0, 195.0, 252.0, 195.0, 282.0, 112.0, 282.0], "score": 0.98, "text": "Table5"}, {"category_id": 15, "poly": [112.0, 279.0, 1383.0, 282.0, 1383.0, 312.0, 112.0, 310.0], "score": 0.99, "text": " Published fow reductions from paired catchment analyses, after Scott et al. (2000), Hickel (2001), Nandakumar and Mein (1993) and Fahey and"}, {"category_id": 15, "poly": [112.0, 307.0, 681.0, 310.0, 681.0, 340.0, 112.0, 338.0], "score": 0.97, "text": "Jackson (1997) compared to estimated reductions in this study"}, {"category_id": 15, "poly": [1330.0, 202.0, 1368.0, 177.0, 1393.0, 215.0, 1355.0, 240.0], "score": 0.99, "text": "261"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. 
/ Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [148.0, 1243.0, 720.0, 1245.0, 719.0, 1275.0, 148.0, 1273.0], "score": 0.98, "text": "A further check on the overall model performance is"}, {"category_id": 15, "poly": [112.0, 1279.0, 719.0, 1277.0, 720.0, 1307.0, 112.0, 1309.0], "score": 0.98, "text": " a comparison with published results of paired catchment"}, {"category_id": 15, "poly": [114.0, 1312.0, 719.0, 1312.0, 719.0, 1342.0, 114.0, 1342.0], "score": 1.0, "text": "studies. The data that can be compared with our results"}, {"category_id": 15, "poly": [114.0, 1346.0, 719.0, 1346.0, 719.0, 1376.0, 114.0, 1376.0], "score": 0.98, "text": "are presented in Table 5 and can be broadly compared"}, {"category_id": 15, "poly": [114.0, 1380.0, 719.0, 1380.0, 719.0, 1410.0, 114.0, 1410.0], "score": 0.99, "text": "with Fig. 4. These data are reductions in years with near"}, {"category_id": 15, "poly": [114.0, 1415.0, 722.0, 1415.0, 722.0, 1445.0, 114.0, 1445.0], "score": 0.99, "text": "average annual rainfall, and at a time after treatment "}, {"category_id": 15, "poly": [114.0, 1445.0, 717.0, 1445.0, 717.0, 1475.0, 114.0, 1475.0], "score": 0.99, "text": "when maximum changes in streamflow have occurred."}, {"category_id": 15, "poly": [114.0, 1481.0, 719.0, 1481.0, 719.0, 1509.0, 114.0, 1509.0], "score": 0.99, "text": "Table 5 also includes estimates on the total and low flow"}, {"category_id": 15, "poly": [114.0, 1511.0, 719.0, 1511.0, 719.0, 1542.0, 114.0, 1542.0], "score": 0.99, "text": "reductions calculated from this study. Results from Pine"}, {"category_id": 15, "poly": [114.0, 1548.0, 719.0, 1548.0, 719.0, 1578.0, 114.0, 1578.0], "score": 0.98, "text": "Creek and Traralgon Creek are not included in Table 5"}, {"category_id": 15, "poly": [114.0, 1582.0, 722.0, 1582.0, 722.0, 1612.0, 114.0, 1612.0], "score": 1.0, "text": "as these catchments are not paired. 
Exact comparisons"}, {"category_id": 15, "poly": [116.0, 1615.0, 719.0, 1615.0, 719.0, 1645.0, 116.0, 1645.0], "score": 0.99, "text": "are impossible because of the rainfall variability, and"}, {"category_id": 15, "poly": [116.0, 1649.0, 717.0, 1649.0, 717.0, 1679.0, 116.0, 1679.0], "score": 0.99, "text": "lack of calibration period for Redhill. Despite this,"}, {"category_id": 15, "poly": [116.0, 1681.0, 719.0, 1681.0, 719.0, 1711.0, 116.0, 1711.0], "score": 0.95, "text": "Table 5 shows that total and low flow reductions"}, {"category_id": 15, "poly": [116.0, 1716.0, 719.0, 1716.0, 719.0, 1746.0, 116.0, 1746.0], "score": 0.99, "text": "estimated from our study are comparable to the results"}, {"category_id": 15, "poly": [116.0, 1750.0, 719.0, 1750.0, 719.0, 1780.0, 116.0, 1780.0], "score": 0.99, "text": "from paired catchment studies, indicating that our"}, {"category_id": 15, "poly": [116.0, 1782.0, 719.0, 1782.0, 719.0, 1812.0, 116.0, 1812.0], "score": 0.99, "text": "simple model has successfully removed the rainfall"}, {"category_id": 15, "poly": [111.0, 1820.0, 190.0, 1814.0, 192.0, 1846.0, 113.0, 1852.0], "score": 0.94, "text": "signal."}, {"category_id": 15, "poly": [112.0, 989.0, 717.0, 987.0, 717.0, 1023.0, 112.0, 1026.0], "score": 1.0, "text": "catchments and the lower deciles at Lambrechtsbos B."}, {"category_id": 15, "poly": [116.0, 1062.0, 715.0, 1062.0, 715.0, 1092.0, 116.0, 1092.0], "score": 0.98, "text": "lower than other published data (Fahey and Jackson,"}, {"category_id": 15, "poly": [116.0, 1092.0, 189.0, 1092.0, 189.0, 1124.0, 116.0, 1124.0], "score": 0.96, "text": "1997)."}, {"category_id": 15, "poly": [116.0, 1028.0, 168.0, 1028.0, 168.0, 1058.0, 116.0, 1058.0], "score": 1.0, "text": "The"}, {"category_id": 15, "poly": [222.0, 1028.0, 715.0, 1028.0, 715.0, 1058.0, 222.0, 1058.0], "score": 1.0, "text": "from Glendhu 2 appears to be substantially"}, {"category_id": 15, "poly": [114.0, 1176.0, 631.0, 1176.0, 631.0, 1213.0, 114.0, 1213.0], 
"score": 0.99, "text": "4.4. Comparison with paired catchment studies"}, {"category_id": 15, "poly": [816.0, 1686.0, 1381.0, 1686.0, 1381.0, 1716.0, 816.0, 1716.0], "score": 0.98, "text": "The aims of the project have largely been met. The"}, {"category_id": 15, "poly": [782.0, 1720.0, 1383.0, 1720.0, 1383.0, 1750.0, 782.0, 1750.0], "score": 0.97, "text": "general characterisation of FDCs and adjustment for"}, {"category_id": 15, "poly": [782.0, 1750.0, 1385.0, 1750.0, 1385.0, 1787.0, 782.0, 1787.0], "score": 0.99, "text": "climate has been very encouraging given the task of"}, {"category_id": 15, "poly": [782.0, 1784.0, 1381.0, 1784.0, 1381.0, 1815.0, 782.0, 1815.0], "score": 0.98, "text": "fitting our model to 10 flow percentiles, for 10 different"}, {"category_id": 15, "poly": [778.0, 1812.0, 1383.0, 1815.0, 1383.0, 1851.0, 777.0, 1849.0], "score": 0.97, "text": "catchments (resulting in 100 model fits\uff09 with"}, {"category_id": 15, "poly": [816.0, 1060.0, 1381.0, 1060.0, 1381.0, 1090.0, 816.0, 1090.0], "score": 0.96, "text": "As this analysis could only be applied, where there"}, {"category_id": 15, "poly": [782.0, 1094.0, 1383.0, 1094.0, 1383.0, 1124.0, 782.0, 1124.0], "score": 1.0, "text": "was consistent drying up of streams, it was confined to"}, {"category_id": 15, "poly": [782.0, 1127.0, 1381.0, 1127.0, 1381.0, 1157.0, 782.0, 1157.0], "score": 0.99, "text": "Stewarts Creek, Pine Creek and Redhill catchments. The"}, {"category_id": 15, "poly": [780.0, 1228.0, 1383.0, 1228.0, 1383.0, 1256.0, 780.0, 1256.0], "score": 0.99, "text": "significant results at the 0.05 level for both parameters at"}, {"category_id": 15, "poly": [780.0, 1260.0, 1383.0, 1260.0, 1383.0, 1290.0, 780.0, 1290.0], "score": 0.97, "text": "all three catchments. The climate adjusted zero flow"}, {"category_id": 15, "poly": [780.0, 1292.0, 1383.0, 1292.0, 1383.0, 1322.0, 780.0, 1322.0], "score": 1.0, "text": "days are shown in Fig. 5. 
The increases in zero flow days"}, {"category_id": 15, "poly": [778.0, 1387.0, 1387.0, 1389.0, 1387.0, 1425.0, 777.0, 1423.0], "score": 0.97, "text": "11 at Redhill. The latter has changed from an almost "}, {"category_id": 15, "poly": [775.0, 1423.0, 1385.0, 1421.0, 1385.0, 1458.0, 775.0, 1460.0], "score": 0.97, "text": " permanent to a highly intermittent stream. The curves"}, {"category_id": 15, "poly": [775.0, 1453.0, 1385.0, 1456.0, 1385.0, 1492.0, 775.0, 1490.0], "score": 0.99, "text": " are also in sensible agreement with the flow reductions"}, {"category_id": 15, "poly": [777.0, 1492.0, 885.0, 1492.0, 885.0, 1522.0, 777.0, 1522.0], "score": 0.99, "text": "in Fig. 4."}, {"category_id": 15, "poly": [1278.0, 1193.0, 1383.0, 1193.0, 1383.0, 1223.0, 1278.0, 1223.0], "score": 0.98, "text": " returned"}, {"category_id": 15, "poly": [777.0, 1327.0, 1295.0, 1327.0, 1295.0, 1357.0, 777.0, 1357.0], "score": 0.98, "text": " are substantial with flows confined to less than"}, {"category_id": 15, "poly": [1351.0, 1327.0, 1385.0, 1327.0, 1385.0, 1357.0, 1351.0, 1357.0], "score": 1.0, "text": "of"}, {"category_id": 15, "poly": [780.0, 1161.0, 1077.0, 1161.0, 1077.0, 1189.0, 780.0, 1189.0], "score": 0.96, "text": "model returned values of"}, {"category_id": 15, "poly": [1102.0, 1161.0, 1381.0, 1161.0, 1381.0, 1189.0, 1102.0, 1189.0], "score": 0.98, "text": "of 0.95, 0.99 and 0.99,"}, {"category_id": 15, "poly": [1134.0, 1193.0, 1200.0, 1193.0, 1200.0, 1223.0, 1134.0, 1223.0], "score": 0.97, "text": " and"}, {"category_id": 15, "poly": [780.0, 1193.0, 989.0, 1193.0, 989.0, 1223.0, 780.0, 1223.0], "score": 0.98, "text": "respectively. 
The"}, {"category_id": 15, "poly": [1004.0, 1193.0, 1112.0, 1193.0, 1112.0, 1223.0, 1004.0, 1223.0], "score": 0.89, "text": "-testson"}, {"category_id": 15, "poly": [1284.0, 1354.0, 1385.0, 1357.0, 1385.0, 1393.0, 1284.0, 1391.0], "score": 1.0, "text": "and year"}, {"category_id": 15, "poly": [775.0, 1354.0, 1106.0, 1357.0, 1106.0, 1393.0, 775.0, 1391.0], "score": 0.98, "text": " the time by year 8 at Stewarts"}, {"category_id": 15, "poly": [1145.0, 1354.0, 1245.0, 1357.0, 1245.0, 1393.0, 1145.0, 1391.0], "score": 1.0, "text": "and Pine"}, {"category_id": 15, "poly": [125.0, 690.0, 1385.0, 694.0, 1385.0, 731.0, 125.0, 727.0], "score": 0.97, "text": "a Rainfall refers to the rainfall in the year used for comparison of results. The value in brackets refers to the deviation from the mean anual"}, {"category_id": 15, "poly": [114.0, 722.0, 408.0, 725.0, 408.0, 755.0, 114.0, 752.0], "score": 1.0, "text": "rainfall for the period of record."}, {"category_id": 15, "poly": [112.0, 832.0, 312.0, 834.0, 311.0, 864.0, 112.0, 862.0], "score": 0.99, "text": " 30-100th percentiles."}, {"category_id": 15, "poly": [125.0, 776.0, 457.0, 780.0, 457.0, 811.0, 125.0, 806.0], "score": 0.96, "text": "c Low flow reduction calculated by"}, {"category_id": 15, "poly": [122.0, 748.0, 463.0, 750.0, 463.0, 780.0, 122.0, 778.0], "score": 0.98, "text": "b Total flow reduction calculated by"}, {"category_id": 15, "poly": [608.0, 748.0, 743.0, 750.0, 743.0, 780.0, 608.0, 778.0], "score": 1.0, "text": "for all deciles."}, {"category_id": 15, "poly": [123.0, 800.0, 374.0, 806.0, 374.0, 843.0, 122.0, 836.0], "score": 0.98, "text": "d For Cathedral Peak 3 the"}, {"category_id": 15, "poly": [1081.0, 800.0, 1385.0, 806.0, 1385.0, 843.0, 1081.0, 836.0], "score": 0.99, "text": "were lower then the values of the"}, {"category_id": 15, "poly": [391.0, 800.0, 430.0, 806.0, 430.0, 843.0, 391.0, 836.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [602.0, 776.0, 772.0, 780.0, 772.0, 811.0, 602.0, 
806.0], "score": 0.99, "text": "for 70, 80, 90 and"}, {"category_id": 15, "poly": [828.0, 776.0, 934.0, 780.0, 934.0, 811.0, 828.0, 806.0], "score": 1.0, "text": "percentiles."}, {"category_id": 15, "poly": [449.0, 800.0, 639.0, 806.0, 639.0, 843.0, 449.0, 836.0], "score": 0.98, "text": "values for the 10 and"}, {"category_id": 15, "poly": [685.0, 800.0, 1065.0, 806.0, 1065.0, 843.0, 685.0, 836.0], "score": 0.99, "text": "percentiles were excluded as the values of"}, {"category_id": 15, "poly": [778.0, 987.0, 993.0, 991.0, 992.0, 1030.0, 777.0, 1025.0], "score": 1.0, "text": "4.5. Zero fow days"}], "page_info": {"page_no": 8, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 4, "poly": [130.26629638671875, 1337.3250732421875, 732.2774047851562, 1337.3250732421875, 732.2774047851562, 1418.7119140625, 130.26629638671875, 1418.7119140625], "score": 0.9999995231628418}, {"category_id": 2, "poly": [131.43930053710938, 195.23593139648438, 165.80084228515625, 195.23593139648438, 165.80084228515625, 215.28358459472656, 131.43930053710938, 215.28358459472656], "score": 0.999997615814209}, {"category_id": 3, "poly": [131.99615478515625, 262.13848876953125, 732.1187744140625, 262.13848876953125, 732.1187744140625, 1316.8990478515625, 131.99615478515625, 1316.8990478515625], "score": 0.9999966621398926}, {"category_id": 2, "poly": [480.581787109375, 195.6566162109375, 1043.8951416015625, 195.6566162109375, 1043.8951416015625, 218.63613891601562, 480.581787109375, 218.63613891601562], "score": 0.9999943971633911}, {"category_id": 1, "poly": [794.3954467773438, 1085.8665771484375, 1394.706298828125, 1085.8665771484375, 1394.706298828125, 1847.09423828125, 794.3954467773438, 1847.09423828125], "score": 0.9999863505363464}, {"category_id": 1, "poly": [795.3126831054688, 256.7186279296875, 1395.0584716796875, 256.7186279296875, 1395.0584716796875, 1079.958251953125, 795.3126831054688, 1079.958251953125], "score": 0.9999836683273315}, {"category_id": 1, "poly": 
[130.74447631835938, 1445.3975830078125, 731.3636474609375, 1445.3975830078125, 731.3636474609375, 1846.5950927734375, 130.74447631835938, 1846.5950927734375], "score": 0.9999815225601196}, {"category_id": 13, "poly": [1045, 452, 1098, 452, 1098, 482, 1045, 482], "score": 0.87, "latex": "27\\%"}, {"category_id": 15, "poly": [129.0, 1339.0, 732.0, 1339.0, 732.0, 1367.0, 129.0, 1367.0], "score": 0.98, "text": "Fig. 5. Number of zero fow days for average rainfall following"}, {"category_id": 15, "poly": [131.0, 1365.0, 730.0, 1365.0, 730.0, 1393.0, 131.0, 1393.0], "score": 0.98, "text": "afforestation for Stewarts Creek 5, Redhill and Pine Creek,"}, {"category_id": 15, "poly": [133.0, 1398.0, 219.0, 1398.0, 219.0, 1421.0, 133.0, 1421.0], "score": 1.0, "text": "Australia."}, {"category_id": 15, "poly": [127.0, 189.0, 172.0, 189.0, 172.0, 228.0, 127.0, 228.0], "score": 0.97, "text": "262"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [827.0, 1086.0, 1398.0, 1086.0, 1398.0, 1116.0, 827.0, 1116.0], "score": 0.99, "text": "The model fits show we have quantified the net"}, {"category_id": 15, "poly": [793.0, 1120.0, 1396.0, 1120.0, 1396.0, 1150.0, 793.0, 1150.0], "score": 0.98, "text": "impact of afforestation for the majority of the flow"}, {"category_id": 15, "poly": [786.0, 1150.0, 1398.0, 1146.0, 1398.0, 1182.0, 786.0, 1187.0], "score": 0.99, "text": " percentiles in the 10 catchments. Results for the 10-50th"}, {"category_id": 15, "poly": [788.0, 1185.0, 1400.0, 1180.0, 1400.0, 1217.0, 788.0, 1221.0], "score": 0.98, "text": " percentiles were particularly encouraging. 
It is not"}, {"category_id": 15, "poly": [793.0, 1219.0, 1396.0, 1219.0, 1396.0, 1249.0, 793.0, 1249.0], "score": 0.99, "text": "surprising that the relationship between rainfall and flow"}, {"category_id": 15, "poly": [793.0, 1251.0, 1396.0, 1251.0, 1396.0, 1281.0, 793.0, 1281.0], "score": 0.97, "text": "diminishes at lower fows (60-100th percentile), where"}, {"category_id": 15, "poly": [793.0, 1286.0, 1396.0, 1286.0, 1396.0, 1316.0, 793.0, 1316.0], "score": 0.98, "text": "seasonal storage effects and rainfall distribution become"}, {"category_id": 15, "poly": [788.0, 1318.0, 1396.0, 1318.0, 1396.0, 1348.0, 788.0, 1348.0], "score": 0.97, "text": " more important drivers for runoff generation. The"}, {"category_id": 15, "poly": [790.0, 1352.0, 1398.0, 1352.0, 1398.0, 1382.0, 790.0, 1382.0], "score": 0.99, "text": "poorest model fits were gained for Lambrechtsbos A"}, {"category_id": 15, "poly": [788.0, 1382.0, 1396.0, 1382.0, 1396.0, 1413.0, 788.0, 1413.0], "score": 0.98, "text": " and B. The likely reason at Lambrechtsbos A is an"}, {"category_id": 15, "poly": [793.0, 1419.0, 1398.0, 1419.0, 1398.0, 1447.0, 793.0, 1447.0], "score": 0.98, "text": "observed annual decrease in stand water use after 12"}, {"category_id": 15, "poly": [793.0, 1451.0, 1398.0, 1451.0, 1398.0, 1481.0, 793.0, 1481.0], "score": 0.99, "text": "years (Scott et al., 2000) which does not conform to the"}, {"category_id": 15, "poly": [793.0, 1486.0, 1398.0, 1486.0, 1398.0, 1516.0, 793.0, 1516.0], "score": 0.99, "text": "sigmoidal form of our model over the full 19 years of"}, {"category_id": 15, "poly": [793.0, 1518.0, 1398.0, 1518.0, 1398.0, 1548.0, 793.0, 1548.0], "score": 0.99, "text": "record. The failure of the model to fit the lower flows at"}, {"category_id": 15, "poly": [786.0, 1546.0, 1400.0, 1548.0, 1400.0, 1585.0, 786.0, 1582.0], "score": 0.98, "text": " Lambrechtsbos B is not as explicable. 
A decrease in"}, {"category_id": 15, "poly": [790.0, 1587.0, 1398.0, 1587.0, 1398.0, 1615.0, 790.0, 1615.0], "score": 1.0, "text": "stand water use in this catchment is observed as the"}, {"category_id": 15, "poly": [793.0, 1619.0, 1400.0, 1619.0, 1400.0, 1649.0, 793.0, 1649.0], "score": 0.99, "text": "plantation ages, but does not occur during the first 20"}, {"category_id": 15, "poly": [793.0, 1651.0, 1398.0, 1651.0, 1398.0, 1681.0, 793.0, 1681.0], "score": 0.99, "text": "years after treatment (Scott et al., 2000). Other data from"}, {"category_id": 15, "poly": [788.0, 1679.0, 1398.0, 1681.0, 1398.0, 1718.0, 788.0, 1716.0], "score": 0.98, "text": " South Africa (Scott et al., 2000) indicate there are"}, {"category_id": 15, "poly": [791.0, 1711.0, 1398.0, 1716.0, 1398.0, 1752.0, 790.0, 1748.0], "score": 0.99, "text": " diminished flow reductions as plantations age, but again,"}, {"category_id": 15, "poly": [795.0, 1752.0, 1396.0, 1752.0, 1396.0, 1782.0, 795.0, 1782.0], "score": 0.99, "text": "generally after 20 years. Our use of an asymptotic curve"}, {"category_id": 15, "poly": [790.0, 1785.0, 1398.0, 1782.0, 1398.0, 1812.0, 790.0, 1815.0], "score": 0.98, "text": "assumes a new equilibrium of stand water use is"}, {"category_id": 15, "poly": [790.0, 1815.0, 1394.0, 1817.0, 1394.0, 1847.0, 790.0, 1845.0], "score": 0.99, "text": "reached. 
The results of the model fitting generally justify"}, {"category_id": 15, "poly": [788.0, 249.0, 1398.0, 252.0, 1398.0, 288.0, 788.0, 286.0], "score": 0.97, "text": " Lambrechtsbos B appear to be over-estimated by our"}, {"category_id": 15, "poly": [788.0, 284.0, 1398.0, 288.0, 1398.0, 325.0, 788.0, 320.0], "score": 0.99, "text": " model, which is unsurprising as the model fit was poor."}, {"category_id": 15, "poly": [793.0, 322.0, 1398.0, 322.0, 1398.0, 353.0, 793.0, 353.0], "score": 0.98, "text": "The remaining four South African catchments, and also"}, {"category_id": 15, "poly": [793.0, 355.0, 1398.0, 355.0, 1398.0, 385.0, 793.0, 385.0], "score": 0.99, "text": "Redhill and Stewarts Creek are in good agreement with"}, {"category_id": 15, "poly": [793.0, 389.0, 1400.0, 389.0, 1400.0, 417.0, 793.0, 417.0], "score": 1.0, "text": "the published values, particularly when the deviation of"}, {"category_id": 15, "poly": [793.0, 421.0, 1398.0, 421.0, 1398.0, 452.0, 793.0, 452.0], "score": 0.99, "text": "average rainfall is considered. Glendhu 2 reductions are"}, {"category_id": 15, "poly": [788.0, 488.0, 1396.0, 488.0, 1396.0, 518.0, 788.0, 518.0], "score": 0.99, "text": " a heavier impact on the lower flows. Overall, it appears"}, {"category_id": 15, "poly": [788.0, 518.0, 1400.0, 518.0, 1400.0, 555.0, 788.0, 555.0], "score": 0.99, "text": " there are no significant discrepancies with the published"}, {"category_id": 15, "poly": [793.0, 555.0, 1398.0, 555.0, 1398.0, 585.0, 793.0, 585.0], "score": 0.99, "text": "paired catchment analyses. 
We suggest our technique"}, {"category_id": 15, "poly": [790.0, 589.0, 1398.0, 589.0, 1398.0, 619.0, 790.0, 619.0], "score": 0.99, "text": " represents an alternative to the paired-catchment method"}, {"category_id": 15, "poly": [788.0, 617.0, 1398.0, 619.0, 1398.0, 656.0, 788.0, 654.0], "score": 0.98, "text": "for assessing hydrologic response to vegetation treat-"}, {"category_id": 15, "poly": [788.0, 651.0, 1400.0, 649.0, 1400.0, 686.0, 788.0, 688.0], "score": 1.0, "text": " ment, where paired data are unavailable. The method"}, {"category_id": 15, "poly": [793.0, 688.0, 1396.0, 688.0, 1396.0, 718.0, 793.0, 718.0], "score": 0.99, "text": "has not yet resulted in a predictive model, but has"}, {"category_id": 15, "poly": [795.0, 722.0, 1394.0, 722.0, 1394.0, 752.0, 795.0, 752.0], "score": 1.0, "text": "increased our knowledge of afforestation impacts. This"}, {"category_id": 15, "poly": [793.0, 755.0, 1398.0, 755.0, 1398.0, 785.0, 793.0, 785.0], "score": 0.96, "text": "is a valuable outcome given the contentious issue of"}, {"category_id": 15, "poly": [793.0, 787.0, 1398.0, 787.0, 1398.0, 817.0, 793.0, 817.0], "score": 0.98, "text": "afforestation in Australia and other countries, and a"}, {"category_id": 15, "poly": [792.0, 821.0, 1398.0, 819.0, 1398.0, 849.0, 793.0, 851.0], "score": 0.99, "text": "current paucity of data on inter-annual flows. 
It should"}, {"category_id": 15, "poly": [788.0, 849.0, 1396.0, 851.0, 1396.0, 888.0, 788.0, 886.0], "score": 0.99, "text": " be noted that nine of the 10 catchment were pine species."}, {"category_id": 15, "poly": [793.0, 888.0, 1398.0, 888.0, 1398.0, 918.0, 793.0, 918.0], "score": 0.98, "text": "More data is required to compare the impact of"}, {"category_id": 15, "poly": [790.0, 920.0, 1390.0, 920.0, 1390.0, 950.0, 790.0, 950.0], "score": 0.99, "text": "hardwood species, particularly eucalypts, on the FDC."}, {"category_id": 15, "poly": [793.0, 950.0, 1396.0, 955.0, 1396.0, 985.0, 792.0, 980.0], "score": 0.98, "text": "Unfortunately these data are currently scarce. There are"}, {"category_id": 15, "poly": [791.0, 980.0, 1398.0, 985.0, 1398.0, 1021.0, 790.0, 1017.0], "score": 1.0, "text": "substantial data on the physiological controls of eucalypt"}, {"category_id": 15, "poly": [793.0, 1019.0, 1396.0, 1019.0, 1396.0, 1049.0, 793.0, 1049.0], "score": 0.99, "text": "water use (see Whitehead and Beadle, 2004), but not at"}, {"category_id": 15, "poly": [790.0, 1054.0, 1016.0, 1054.0, 1016.0, 1084.0, 790.0, 1084.0], "score": 0.98, "text": "the catchment scale."}, {"category_id": 15, "poly": [788.0, 452.0, 1044.0, 452.0, 1044.0, 488.0, 788.0, 488.0], "score": 0.91, "text": " close to the reported "}, {"category_id": 15, "poly": [1099.0, 452.0, 1398.0, 452.0, 1398.0, 488.0, 1099.0, 488.0], "score": 0.97, "text": ", but our model produces"}, {"category_id": 15, "poly": [129.0, 1443.0, 732.0, 1443.0, 732.0, 1479.0, 129.0, 1479.0], "score": 0.99, "text": "substantially varying spatial scales, soils and geology,"}, {"category_id": 15, "poly": [129.0, 1479.0, 732.0, 1479.0, 732.0, 1509.0, 129.0, 1509.0], "score": 0.99, "text": "species planted and climatic environments. 
Although"}, {"category_id": 15, "poly": [127.0, 1511.0, 735.0, 1511.0, 735.0, 1548.0, 127.0, 1548.0], "score": 0.98, "text": "there were poor results for individual deciles, the FDCs "}, {"category_id": 15, "poly": [129.0, 1548.0, 732.0, 1548.0, 732.0, 1578.0, 129.0, 1578.0], "score": 0.99, "text": "at eight of the 10 catchments were adequately described"}, {"category_id": 15, "poly": [127.0, 1580.0, 730.0, 1578.0, 730.0, 1608.0, 127.0, 1610.0], "score": 0.99, "text": "by Eq. (2). The results of the statistical tests in which the"}, {"category_id": 15, "poly": [129.0, 1612.0, 726.0, 1612.0, 726.0, 1643.0, 129.0, 1643.0], "score": 0.99, "text": "rainfall term was significant for most deciles demon-"}, {"category_id": 15, "poly": [127.0, 1647.0, 735.0, 1647.0, 735.0, 1683.0, 127.0, 1683.0], "score": 0.99, "text": "strated the model structure was appropriate for adjusting"}, {"category_id": 15, "poly": [129.0, 1683.0, 730.0, 1683.0, 730.0, 1711.0, 129.0, 1711.0], "score": 0.96, "text": "the FDCs for climatic (rainfall) variability. The"}, {"category_id": 15, "poly": [129.0, 1716.0, 732.0, 1716.0, 732.0, 1746.0, 129.0, 1746.0], "score": 0.95, "text": "comparisons of our results with published paired"}, {"category_id": 15, "poly": [129.0, 1750.0, 732.0, 1750.0, 732.0, 1780.0, 129.0, 1780.0], "score": 0.98, "text": "catchment analyses are satisfactory, although the"}, {"category_id": 15, "poly": [129.0, 1784.0, 735.0, 1784.0, 735.0, 1812.0, 129.0, 1812.0], "score": 0.98, "text": "different methodologies make direct comparisons of"}, {"category_id": 15, "poly": [127.0, 1815.0, 737.0, 1817.0, 737.0, 1847.0, 127.0, 1845.0], "score": 0.98, "text": "deciles with total fow uncertain. 
Low flows at"}], "page_info": {"page_no": 9, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [466.4325256347656, 194.8888397216797, 1031.1922607421875, 194.8888397216797, 1031.1922607421875, 219.64439392089844, 466.4325256347656, 219.64439392089844], "score": 0.9999977350234985}, {"category_id": 0, "poly": [781.4110107421875, 1350.9322509765625, 1112.086181640625, 1350.9322509765625, 1112.086181640625, 1380.4071044921875, 781.4110107421875, 1380.4071044921875], "score": 0.9999973773956299}, {"category_id": 1, "poly": [118.7479248046875, 587.0300903320312, 715.7766723632812, 587.0300903320312, 715.7766723632812, 883.0694580078125, 118.7479248046875, 883.0694580078125], "score": 0.9999968409538269}, {"category_id": 1, "poly": [118.48811340332031, 252.823486328125, 715.749267578125, 252.823486328125, 715.749267578125, 583.28515625, 118.48811340332031, 583.28515625], "score": 0.9999964237213135}, {"category_id": 1, "poly": [117.62772369384766, 885.8139038085938, 717.3323974609375, 885.8139038085938, 717.3323974609375, 1415.2767333984375, 117.62772369384766, 1415.2767333984375], "score": 0.9999961853027344}, {"category_id": 1, "poly": [782.490234375, 254.01434326171875, 1380.5517578125, 254.01434326171875, 1380.5517578125, 748.8712768554688, 782.490234375, 748.8712768554688], "score": 0.9999944567680359}, {"category_id": 1, "poly": [117.28860473632812, 1415.5831298828125, 716.8341064453125, 1415.5831298828125, 716.8341064453125, 1847.5146484375, 117.28860473632812, 1847.5146484375], "score": 0.9999933242797852}, {"category_id": 1, "poly": [781.5156860351562, 752.15576171875, 1380.3497314453125, 752.15576171875, 1380.3497314453125, 1279.9158935546875, 781.5156860351562, 1279.9158935546875], "score": 0.9999922513961792}, {"category_id": 1, "poly": [781.4845581054688, 1417.2979736328125, 1380.813232421875, 1417.2979736328125, 1380.813232421875, 1845.5704345703125, 781.4845581054688, 1845.5704345703125], "score": 0.9999920725822449}, 
{"category_id": 2, "poly": [1346.2413330078125, 196.15005493164062, 1380.82568359375, 196.15005493164062, 1380.82568359375, 216.4473876953125, 1346.2413330078125, 216.4473876953125], "score": 0.9999884366989136}, {"category_id": 13, "poly": [510, 1017, 563, 1017, 563, 1047, 510, 1047], "score": 0.89, "latex": "85\\%"}, {"category_id": 13, "poly": [1121, 321, 1143, 321, 1143, 347, 1121, 347], "score": 0.55, "latex": "E"}, {"category_id": 13, "poly": [433, 354, 456, 354, 456, 380, 433, 380], "score": 0.47, "latex": "E."}, {"category_id": 13, "poly": [578, 1018, 683, 1018, 683, 1048, 578, 1048], "score": 0.39, "latex": "1260\\,\\mathrm{mm}"}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [780.0, 1350.0, 1117.0, 1350.0, 1117.0, 1387.0, 780.0, 1387.0], "score": 0.99, "text": "6. Summary and conclusions"}, {"category_id": 15, "poly": [150.0, 585.0, 717.0, 585.0, 717.0, 615.0, 150.0, 615.0], "score": 0.98, "text": "The small Australian catchments converted to pine in"}, {"category_id": 15, "poly": [109.0, 619.0, 724.0, 617.0, 724.0, 654.0, 110.0, 656.0], "score": 1.0, "text": "response group 1 (Stewarts Creek 5, Pine Creek and"}, {"category_id": 15, "poly": [112.0, 649.0, 722.0, 651.0, 722.0, 688.0, 112.0, 686.0], "score": 0.99, "text": "Redhill) have similar shallow soils, potential evapo-"}, {"category_id": 15, "poly": [116.0, 688.0, 717.0, 688.0, 717.0, 718.0, 116.0, 718.0], "score": 0.99, "text": "transpiration and rainfall distribution (relatively uni-"}, {"category_id": 15, "poly": [116.0, 722.0, 717.0, 722.0, 717.0, 752.0, 116.0, 752.0], "score": 1.0, "text": "form) although Stewarts Creek is significantly wetter."}, {"category_id": 15, "poly": [116.0, 755.0, 719.0, 755.0, 719.0, 785.0, 116.0, 785.0], "score": 0.98, "text": "The combination of small catchment area and the"}, {"category_id": 15, "poly": [116.0, 
787.0, 717.0, 787.0, 717.0, 817.0, 116.0, 817.0], "score": 0.99, "text": "increased transpirative demand that exceeds summer"}, {"category_id": 15, "poly": [114.0, 819.0, 720.0, 821.0, 719.0, 851.0, 114.0, 849.0], "score": 1.0, "text": "and autumn rainfall and stored water results in the large"}, {"category_id": 15, "poly": [116.0, 856.0, 638.0, 856.0, 638.0, 884.0, 116.0, 884.0], "score": 0.98, "text": "impact on lower flows, compared to high flows."}, {"category_id": 15, "poly": [114.0, 252.0, 722.0, 252.0, 722.0, 288.0, 114.0, 288.0], "score": 0.98, "text": "this assumption for the length of commercial plantation"}, {"category_id": 15, "poly": [114.0, 286.0, 719.0, 286.0, 719.0, 322.0, 114.0, 322.0], "score": 0.99, "text": "growth (up to 20 years) considered here. The physio-"}, {"category_id": 15, "poly": [116.0, 322.0, 719.0, 322.0, 719.0, 353.0, 116.0, 353.0], "score": 0.99, "text": "logical relationship between stand age and water use for"}, {"category_id": 15, "poly": [114.0, 385.0, 719.0, 385.0, 719.0, 421.0, 114.0, 421.0], "score": 1.0, "text": "thoroughly investigated, although Cornish and Vertessy"}, {"category_id": 15, "poly": [112.0, 417.0, 722.0, 419.0, 722.0, 456.0, 112.0, 454.0], "score": 0.99, "text": "(2001) and Roberts et al. 
(2001) have shown young"}, {"category_id": 15, "poly": [116.0, 456.0, 719.0, 456.0, 719.0, 486.0, 116.0, 486.0], "score": 1.0, "text": "mixed species eucalypt forests may use more water than"}, {"category_id": 15, "poly": [112.0, 484.0, 722.0, 486.0, 722.0, 522.0, 112.0, 520.0], "score": 0.99, "text": " mature stands, and Putahena and Cordery (2000) suggest "}, {"category_id": 15, "poly": [116.0, 522.0, 719.0, 522.0, 719.0, 553.0, 116.0, 553.0], "score": 0.99, "text": "maximum Pinus radiata water use may have been"}, {"category_id": 15, "poly": [114.0, 553.0, 655.0, 553.0, 655.0, 583.0, 114.0, 583.0], "score": 0.99, "text": "reached after 12 years, with a subsequent decline."}, {"category_id": 15, "poly": [116.0, 355.0, 432.0, 355.0, 432.0, 385.0, 116.0, 385.0], "score": 1.0, "text": "plantation species other than"}, {"category_id": 15, "poly": [457.0, 355.0, 719.0, 355.0, 719.0, 385.0, 457.0, 385.0], "score": 0.96, "text": "regnans have not been"}, {"category_id": 15, "poly": [146.0, 881.0, 720.0, 884.0, 719.0, 920.0, 146.0, 918.0], "score": 0.98, "text": " The magnitude of the response within Group 2 varies"}, {"category_id": 15, "poly": [116.0, 920.0, 717.0, 920.0, 717.0, 950.0, 116.0, 950.0], "score": 1.0, "text": "considerably, with greater reduction in flows in the two"}, {"category_id": 15, "poly": [116.0, 952.0, 717.0, 952.0, 717.0, 983.0, 116.0, 983.0], "score": 0.98, "text": "Cathedral Peak catchments, and Lambrechtsbos B."}, {"category_id": 15, "poly": [114.0, 987.0, 719.0, 987.0, 719.0, 1017.0, 114.0, 1017.0], "score": 0.99, "text": "Potential evaporation is in phase with rainfall at the"}, {"category_id": 15, "poly": [114.0, 1054.0, 722.0, 1054.0, 722.0, 1084.0, 114.0, 1084.0], "score": 0.98, "text": "average) of their rainfall in summer. 
The conjunction of"}, {"category_id": 15, "poly": [116.0, 1088.0, 722.0, 1088.0, 722.0, 1118.0, 116.0, 1118.0], "score": 0.99, "text": "peak demand and plant water availability may explain"}, {"category_id": 15, "poly": [114.0, 1118.0, 722.0, 1120.0, 722.0, 1150.0, 114.0, 1148.0], "score": 0.98, "text": "the high reductions relative to the remaining catchments "}, {"category_id": 15, "poly": [116.0, 1155.0, 722.0, 1155.0, 722.0, 1185.0, 116.0, 1185.0], "score": 0.98, "text": "in Group 2. In addition, the stocking density was"}, {"category_id": 15, "poly": [116.0, 1187.0, 717.0, 1187.0, 717.0, 1215.0, 116.0, 1215.0], "score": 0.97, "text": "described as \u2018abnormally dense\u2019 by Scott et al. (2000)."}, {"category_id": 15, "poly": [116.0, 1219.0, 719.0, 1219.0, 719.0, 1249.0, 116.0, 1249.0], "score": 1.0, "text": "Growth at Glendhu 2 was notably slow (Fahey and"}, {"category_id": 15, "poly": [112.0, 1247.0, 722.0, 1249.0, 722.0, 1286.0, 112.0, 1284.0], "score": 0.98, "text": " Jackson, 1997) and Lambrechtsbos A and Biesievlei are"}, {"category_id": 15, "poly": [116.0, 1286.0, 719.0, 1286.0, 719.0, 1316.0, 116.0, 1316.0], "score": 1.0, "text": "described as being within sub optimal growth zones"}, {"category_id": 15, "poly": [116.0, 1318.0, 719.0, 1318.0, 719.0, 1348.0, 116.0, 1348.0], "score": 0.99, "text": "(Scott and Smith, 1997) characterised by these authors"}, {"category_id": 15, "poly": [114.0, 1352.0, 719.0, 1352.0, 719.0, 1382.0, 114.0, 1382.0], "score": 0.99, "text": "as having relatively slow response times and lesser"}, {"category_id": 15, "poly": [109.0, 1383.0, 586.0, 1380.0, 586.0, 1417.0, 110.0, 1419.0], "score": 0.96, "text": " reductions that those at more optimal sites."}, {"category_id": 15, "poly": [112.0, 1017.0, 509.0, 1017.0, 509.0, 1054.0, 112.0, 1054.0], "score": 0.98, "text": " Cathedral Peak sites as they receive"}, {"category_id": 15, "poly": [684.0, 1017.0, 719.0, 1017.0, 719.0, 1054.0, 684.0, 1054.0], "score": 0.93, "text": "on"}, 
{"category_id": 15, "poly": [810.0, 249.0, 1383.0, 252.0, 1383.0, 288.0, 810.0, 286.0], "score": 1.0, "text": "Traralgon Creek would be expected to have both the"}, {"category_id": 15, "poly": [780.0, 286.0, 1385.0, 286.0, 1385.0, 322.0, 780.0, 322.0], "score": 0.98, "text": "most subdued flow reductions and longer response time"}, {"category_id": 15, "poly": [780.0, 355.0, 1385.0, 355.0, 1385.0, 385.0, 780.0, 385.0], "score": 0.97, "text": "uncertain vegetation record. Peak stand water use of a"}, {"category_id": 15, "poly": [777.0, 389.0, 1381.0, 389.0, 1381.0, 419.0, 777.0, 419.0], "score": 0.99, "text": "natural stand of this species is around 30 years."}, {"category_id": 15, "poly": [782.0, 421.0, 1383.0, 421.0, 1383.0, 452.0, 782.0, 452.0], "score": 0.98, "text": "Additionally in this large, \u2018real world\u2019 catchment,"}, {"category_id": 15, "poly": [780.0, 456.0, 1385.0, 456.0, 1385.0, 486.0, 780.0, 486.0], "score": 0.99, "text": "there is a continuous cycle of forest management"}, {"category_id": 15, "poly": [782.0, 488.0, 1385.0, 488.0, 1385.0, 518.0, 782.0, 518.0], "score": 0.98, "text": "which includes harvesting. A mixture of pasture and"}, {"category_id": 15, "poly": [784.0, 522.0, 1383.0, 522.0, 1383.0, 553.0, 784.0, 553.0], "score": 0.99, "text": "'scrub', which could represent significant understorey"}, {"category_id": 15, "poly": [780.0, 555.0, 1381.0, 555.0, 1381.0, 585.0, 780.0, 585.0], "score": 0.99, "text": "stands, were replaced by plantation species. Conse-"}, {"category_id": 15, "poly": [780.0, 589.0, 1385.0, 589.0, 1385.0, 619.0, 780.0, 619.0], "score": 0.99, "text": "quently the difference between pre and post treatment"}, {"category_id": 15, "poly": [777.0, 619.0, 1383.0, 619.0, 1383.0, 649.0, 777.0, 649.0], "score": 0.98, "text": "ET may be less than at other catchments. 
Reductions of"}, {"category_id": 15, "poly": [777.0, 651.0, 1385.0, 651.0, 1385.0, 688.0, 777.0, 688.0], "score": 1.0, "text": "this magnitude could be more readily expected in larger,"}, {"category_id": 15, "poly": [780.0, 688.0, 1383.0, 688.0, 1383.0, 718.0, 780.0, 718.0], "score": 0.99, "text": "multi land use catchments than the very high impacts"}, {"category_id": 15, "poly": [782.0, 720.0, 1293.0, 720.0, 1293.0, 750.0, 782.0, 750.0], "score": 0.98, "text": "estimated at the smaller Australian catchments."}, {"category_id": 15, "poly": [780.0, 322.0, 1120.0, 322.0, 1120.0, 353.0, 780.0, 353.0], "score": 0.93, "text": "because of the large area of "}, {"category_id": 15, "poly": [1144.0, 322.0, 1385.0, 322.0, 1385.0, 353.0, 1144.0, 353.0], "score": 0.99, "text": "regnans forest, and"}, {"category_id": 15, "poly": [146.0, 1412.0, 720.0, 1415.0, 719.0, 1451.0, 146.0, 1449.0], "score": 0.98, "text": " The response groups may be in part explained by the"}, {"category_id": 15, "poly": [109.0, 1449.0, 722.0, 1447.0, 722.0, 1483.0, 110.0, 1486.0], "score": 0.96, "text": "storage characteristics of the catchments. Accurate"}, {"category_id": 15, "poly": [114.0, 1486.0, 717.0, 1486.0, 717.0, 1516.0, 114.0, 1516.0], "score": 0.99, "text": "measures of storage are not available from the literature,"}, {"category_id": 15, "poly": [114.0, 1518.0, 719.0, 1518.0, 719.0, 1546.0, 114.0, 1546.0], "score": 0.98, "text": "but the soil depths and the baseflow index (Table 1) both"}, {"category_id": 15, "poly": [114.0, 1552.0, 722.0, 1552.0, 722.0, 1580.0, 114.0, 1580.0], "score": 0.97, "text": "show the three south eastern Australian catchments with"}, {"category_id": 15, "poly": [114.0, 1587.0, 722.0, 1587.0, 722.0, 1617.0, 114.0, 1617.0], "score": 0.98, "text": "the greatest reduction are likely to have the lowest"}, {"category_id": 15, "poly": [109.0, 1617.0, 722.0, 1615.0, 722.0, 1651.0, 110.0, 1653.0], "score": 0.99, "text": " storage capacity. 
The greater flow reductions, particu-"}, {"category_id": 15, "poly": [116.0, 1651.0, 717.0, 1651.0, 717.0, 1681.0, 116.0, 1681.0], "score": 0.97, "text": "larly for low flows, could be expected under these"}, {"category_id": 15, "poly": [116.0, 1686.0, 719.0, 1686.0, 719.0, 1716.0, 116.0, 1716.0], "score": 0.99, "text": "conditions. Inclusion of a storage term in the model is an"}, {"category_id": 15, "poly": [116.0, 1718.0, 719.0, 1718.0, 719.0, 1748.0, 116.0, 1748.0], "score": 0.99, "text": "obvious option for improving the analysis. However the"}, {"category_id": 15, "poly": [116.0, 1752.0, 719.0, 1752.0, 719.0, 1782.0, 116.0, 1782.0], "score": 0.98, "text": "addition of extra parameters would be at the cost of"}, {"category_id": 15, "poly": [116.0, 1784.0, 717.0, 1784.0, 717.0, 1815.0, 116.0, 1815.0], "score": 0.99, "text": "maintaining model simplicity, particularly as character-"}, {"category_id": 15, "poly": [116.0, 1817.0, 518.0, 1817.0, 518.0, 1847.0, 116.0, 1847.0], "score": 1.0, "text": "ising a transient storage is not trivial."}, {"category_id": 15, "poly": [816.0, 755.0, 1381.0, 755.0, 1381.0, 785.0, 816.0, 785.0], "score": 0.97, "text": "The analysis of zero flow days was successful,"}, {"category_id": 15, "poly": [782.0, 787.0, 1383.0, 787.0, 1383.0, 817.0, 782.0, 817.0], "score": 0.99, "text": "demonstrating that the impact on flow intermittence can"}, {"category_id": 15, "poly": [780.0, 819.0, 1383.0, 819.0, 1383.0, 849.0, 780.0, 849.0], "score": 1.0, "text": "be evaluated without of the entire FDC. This was helpful"}, {"category_id": 15, "poly": [777.0, 854.0, 1381.0, 854.0, 1381.0, 884.0, 777.0, 884.0], "score": 0.98, "text": " as the change in the higher percentiles (low flows) could"}, {"category_id": 15, "poly": [782.0, 886.0, 1381.0, 886.0, 1381.0, 916.0, 782.0, 916.0], "score": 0.97, "text": "not always be modelled. 
The results for the three"}, {"category_id": 15, "poly": [782.0, 920.0, 1381.0, 920.0, 1381.0, 950.0, 782.0, 950.0], "score": 0.98, "text": "catchments analysed are a rather stark indication of the"}, {"category_id": 15, "poly": [782.0, 955.0, 1383.0, 955.0, 1383.0, 985.0, 782.0, 985.0], "score": 0.99, "text": "potential for highly increased zero flow periods in small"}, {"category_id": 15, "poly": [782.0, 987.0, 1381.0, 987.0, 1381.0, 1017.0, 782.0, 1017.0], "score": 0.99, "text": "catchments, at least in south-eastern Australia. However,"}, {"category_id": 15, "poly": [778.0, 1015.0, 1383.0, 1019.0, 1383.0, 1054.0, 777.0, 1049.0], "score": 0.99, "text": "it should be noted these curves probably represent a"}, {"category_id": 15, "poly": [777.0, 1054.0, 1383.0, 1054.0, 1383.0, 1084.0, 777.0, 1084.0], "score": 0.97, "text": " maximum response as they are all derived from small"}, {"category_id": 15, "poly": [777.0, 1088.0, 1381.0, 1088.0, 1381.0, 1118.0, 777.0, 1118.0], "score": 0.97, "text": "catchments with small storage capacities and large"}, {"category_id": 15, "poly": [780.0, 1120.0, 1385.0, 1120.0, 1385.0, 1150.0, 780.0, 1150.0], "score": 0.99, "text": "percentages of afforestation. This method could be used"}, {"category_id": 15, "poly": [777.0, 1150.0, 1381.0, 1150.0, 1381.0, 1180.0, 777.0, 1180.0], "score": 1.0, "text": "to determine change in the occurrence of any given flow"}, {"category_id": 15, "poly": [780.0, 1187.0, 1383.0, 1187.0, 1383.0, 1217.0, 780.0, 1217.0], "score": 0.98, "text": "in response to afforestation; e.g. 
to determine the"}, {"category_id": 15, "poly": [775.0, 1215.0, 1387.0, 1217.0, 1387.0, 1253.0, 775.0, 1251.0], "score": 0.96, "text": "likelihood of maintaining a reservoir storage or an"}, {"category_id": 15, "poly": [782.0, 1253.0, 1381.0, 1253.0, 1381.0, 1284.0, 782.0, 1284.0], "score": 1.0, "text": "environmental fow that requires an average critical flow."}, {"category_id": 15, "poly": [810.0, 1412.0, 1385.0, 1417.0, 1385.0, 1453.0, 810.0, 1449.0], "score": 0.98, "text": "This project sought to (i) develop a method to remove"}, {"category_id": 15, "poly": [780.0, 1453.0, 1383.0, 1453.0, 1383.0, 1481.0, 780.0, 1481.0], "score": 1.0, "text": "the climate signal from streamflow records to identify"}, {"category_id": 15, "poly": [780.0, 1486.0, 1383.0, 1486.0, 1383.0, 1516.0, 780.0, 1516.0], "score": 0.98, "text": "the impact of vegetation on flow from afforested"}, {"category_id": 15, "poly": [780.0, 1518.0, 1385.0, 1518.0, 1385.0, 1548.0, 780.0, 1548.0], "score": 1.0, "text": "catchments, and (ii) quantify this impact on the flow"}, {"category_id": 15, "poly": [777.0, 1550.0, 1383.0, 1550.0, 1383.0, 1580.0, 777.0, 1580.0], "score": 0.99, "text": "duration curve. A simple model was proposed that"}, {"category_id": 15, "poly": [775.0, 1582.0, 1387.0, 1582.0, 1387.0, 1619.0, 775.0, 1619.0], "score": 0.98, "text": " considered the age of plantation and the annual rainfall"}, {"category_id": 15, "poly": [777.0, 1619.0, 1385.0, 1619.0, 1385.0, 1649.0, 777.0, 1649.0], "score": 0.98, "text": "to be the principal drivers for evapotranspiration. 
This"}, {"category_id": 15, "poly": [780.0, 1651.0, 1385.0, 1651.0, 1385.0, 1679.0, 780.0, 1679.0], "score": 0.99, "text": "model was fitted to the observed deciles of the FDC, and"}, {"category_id": 15, "poly": [778.0, 1681.0, 1381.0, 1686.0, 1381.0, 1716.0, 777.0, 1711.0], "score": 0.97, "text": "the climate signal was then removed from the stream-"}, {"category_id": 15, "poly": [780.0, 1716.0, 1387.0, 1716.0, 1387.0, 1752.0, 780.0, 1752.0], "score": 0.99, "text": "flow records by adjusting the FDC for average rainfall"}, {"category_id": 15, "poly": [777.0, 1748.0, 1385.0, 1746.0, 1385.0, 1782.0, 778.0, 1785.0], "score": 0.98, "text": "over the period of record. The model was tested and"}, {"category_id": 15, "poly": [777.0, 1780.0, 1381.0, 1780.0, 1381.0, 1817.0, 777.0, 1817.0], "score": 0.99, "text": "applied to 10 afforested catchments. We successfully"}, {"category_id": 15, "poly": [778.0, 1810.0, 1385.0, 1815.0, 1385.0, 1851.0, 777.0, 1847.0], "score": 1.0, "text": "fitted our model to catchments with varying spatial"}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "263"}], "page_info": {"page_no": 10, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 0, "poly": [132.2023162841797, 944.2422485351562, 352.39361572265625, 944.2422485351562, 352.39361572265625, 973.2088623046875, 132.2023162841797, 973.2088623046875], "score": 0.9999986886978149}, {"category_id": 2, "poly": [480.25787353515625, 196.37384033203125, 1044.1737060546875, 196.37384033203125, 1044.1737060546875, 218.59146118164062, 480.25787353515625, 218.59146118164062], "score": 0.9999973177909851}, {"category_id": 1, "poly": [131.0494842529297, 255.921875, 730.36865234375, 255.921875, 730.36865234375, 848.8026123046875, 131.0494842529297, 848.8026123046875], "score": 0.9999949336051941}, {"category_id": 1, "poly": [130.6344757080078, 1010.9166870117188, 730.247314453125, 1010.9166870117188, 730.247314453125, 
1437.8150634765625, 130.6344757080078, 1437.8150634765625], "score": 0.9999939203262329}, {"category_id": 2, "poly": [130.68336486816406, 196.22927856445312, 164.99691772460938, 196.22927856445312, 164.99691772460938, 215.17306518554688, 130.68336486816406, 215.17306518554688], "score": 0.9999921321868896}, {"category_id": 1, "poly": [131.60971069335938, 1597.386962890625, 732.5863647460938, 1597.386962890625, 732.5863647460938, 1846.581787109375, 131.60971069335938, 1846.581787109375], "score": 0.9999908208847046}, {"category_id": 1, "poly": [791.6022338867188, 251.6699676513672, 1397.9674072265625, 251.6699676513672, 1397.9674072265625, 1848.8499755859375, 791.6022338867188, 1848.8499755859375], "score": 0.9999874830245972}, {"category_id": 0, "poly": [131.00613403320312, 1534.647705078125, 256.1180725097656, 1534.647705078125, 256.1180725097656, 1561.1875, 131.00613403320312, 1561.1875], "score": 0.9999844431877136}, {"category_id": 13, "poly": [1067, 1022, 1120, 1022, 1120, 1049, 1067, 1049], "score": 0.57, "latex": "219{\\mathrm{~p~}}"}, {"category_id": 15, "poly": [129.0, 939.0, 357.0, 944.0, 356.0, 983.0, 129.0, 978.0], "score": 1.0, "text": "Acknowledgements"}, {"category_id": 15, "poly": [481.0, 194.0, 1046.0, 194.0, 1046.0, 224.0, 481.0, 224.0], "score": 0.97, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [127.0, 252.0, 734.0, 249.0, 735.0, 286.0, 127.0, 288.0], "score": 0.99, "text": "scales, species and environments, and have shown that it"}, {"category_id": 15, "poly": [129.0, 290.0, 732.0, 290.0, 732.0, 320.0, 129.0, 320.0], "score": 0.99, "text": "provides a means of separating the influence of climate"}, {"category_id": 15, "poly": [129.0, 322.0, 732.0, 322.0, 732.0, 353.0, 129.0, 353.0], "score": 0.99, "text": "and vegetation on the FDCs. 
The modelled results"}, {"category_id": 15, "poly": [129.0, 353.0, 730.0, 353.0, 730.0, 383.0, 129.0, 383.0], "score": 0.97, "text": "showed the greatest proportional impacts were for"}, {"category_id": 15, "poly": [129.0, 387.0, 732.0, 387.0, 732.0, 417.0, 129.0, 417.0], "score": 0.99, "text": "median and lower flows. The flow reductions from the"}, {"category_id": 15, "poly": [129.0, 421.0, 732.0, 421.0, 732.0, 449.0, 129.0, 449.0], "score": 0.98, "text": "three small catchments SE Australian were the highest"}, {"category_id": 15, "poly": [129.0, 456.0, 735.0, 456.0, 735.0, 486.0, 129.0, 486.0], "score": 0.99, "text": "and may reflect lower storages. The characterisation of"}, {"category_id": 15, "poly": [129.0, 488.0, 732.0, 488.0, 732.0, 518.0, 129.0, 518.0], "score": 0.99, "text": "the number of zero flow days was also successful for"}, {"category_id": 15, "poly": [129.0, 522.0, 735.0, 522.0, 735.0, 553.0, 129.0, 553.0], "score": 0.98, "text": "these catchments in indicating a significant increase in"}, {"category_id": 15, "poly": [129.0, 555.0, 730.0, 555.0, 730.0, 585.0, 129.0, 585.0], "score": 0.99, "text": "zero flows. The flow reductions identified here probably"}, {"category_id": 15, "poly": [127.0, 589.0, 732.0, 589.0, 732.0, 619.0, 127.0, 619.0], "score": 0.97, "text": "represent a maximum effect given the size of the"}, {"category_id": 15, "poly": [129.0, 619.0, 728.0, 619.0, 728.0, 649.0, 129.0, 649.0], "score": 0.99, "text": "catchments, level of afforestation and the shallow soils."}, {"category_id": 15, "poly": [129.0, 654.0, 732.0, 654.0, 732.0, 684.0, 129.0, 684.0], "score": 0.98, "text": "These results have yielded useful new insights on the"}, {"category_id": 15, "poly": [129.0, 688.0, 735.0, 688.0, 735.0, 718.0, 129.0, 718.0], "score": 0.99, "text": "contentious issue of the hydrological impact of"}, {"category_id": 15, "poly": [129.0, 720.0, 732.0, 720.0, 732.0, 750.0, 129.0, 750.0], "score": 0.99, "text": "afforestation. 
This research has led to the development"}, {"category_id": 15, "poly": [129.0, 755.0, 732.0, 755.0, 732.0, 785.0, 129.0, 785.0], "score": 0.99, "text": "of a method to assess the net impact of afforestation on"}, {"category_id": 15, "poly": [129.0, 787.0, 730.0, 787.0, 730.0, 817.0, 129.0, 817.0], "score": 0.99, "text": "the fow duration curve which does not require paired-"}, {"category_id": 15, "poly": [127.0, 819.0, 591.0, 821.0, 591.0, 851.0, 127.0, 849.0], "score": 0.98, "text": "catchments to remove climatic variability."}, {"category_id": 15, "poly": [163.0, 1010.0, 730.0, 1010.0, 730.0, 1041.0, 163.0, 1041.0], "score": 0.98, "text": "The authors would like to thank Rory Nathan,"}, {"category_id": 15, "poly": [127.0, 1041.0, 735.0, 1041.0, 735.0, 1077.0, 127.0, 1077.0], "score": 1.0, "text": "Narendra Tuteja, Tom McMahon, Geoff Podger, Rob"}, {"category_id": 15, "poly": [131.0, 1077.0, 728.0, 1077.0, 728.0, 1107.0, 131.0, 1107.0], "score": 0.99, "text": "Vertessy, Glen Walker and Peter Hairsine for particu-"}, {"category_id": 15, "poly": [129.0, 1109.0, 732.0, 1109.0, 732.0, 1140.0, 129.0, 1140.0], "score": 0.99, "text": "larly helpful discussions on methodologies and reviews,"}, {"category_id": 15, "poly": [127.0, 1142.0, 732.0, 1144.0, 732.0, 1174.0, 127.0, 1172.0], "score": 1.0, "text": "Richard Morton for valuable statistical advice, Dave"}, {"category_id": 15, "poly": [129.0, 1178.0, 730.0, 1178.0, 730.0, 1208.0, 129.0, 1208.0], "score": 1.0, "text": "Scott for supplying the South African data, Barry Fahey"}, {"category_id": 15, "poly": [127.0, 1208.0, 732.0, 1210.0, 732.0, 1241.0, 127.0, 1238.0], "score": 0.99, "text": "for the New Zealand data, and Hancocks Victorian"}, {"category_id": 15, "poly": [127.0, 1241.0, 730.0, 1243.0, 730.0, 1273.0, 127.0, 1271.0], "score": 0.99, "text": "Plantations for vegetation data. 
The study was funded by"}, {"category_id": 15, "poly": [129.0, 1277.0, 735.0, 1277.0, 735.0, 1307.0, 129.0, 1307.0], "score": 0.97, "text": "the Victorian Department of Natural Resources and"}, {"category_id": 15, "poly": [129.0, 1312.0, 732.0, 1312.0, 732.0, 1339.0, 129.0, 1339.0], "score": 0.98, "text": "Environment Private Forestry Unit, the CRC for"}, {"category_id": 15, "poly": [129.0, 1344.0, 735.0, 1344.0, 735.0, 1374.0, 129.0, 1374.0], "score": 1.0, "text": "Catchment Hydrology, and the MDBC funded project"}, {"category_id": 15, "poly": [127.0, 1372.0, 735.0, 1374.0, 734.0, 1410.0, 127.0, 1408.0], "score": 0.99, "text": "\u201cIntegrated assessment of the effects of land use changes"}, {"category_id": 15, "poly": [127.0, 1410.0, 558.0, 1408.0, 558.0, 1438.0, 127.0, 1441.0], "score": 0.98, "text": "on water yield and salt loads\u2019 (D2013)."}, {"category_id": 15, "poly": [127.0, 189.0, 170.0, 189.0, 170.0, 228.0, 127.0, 228.0], "score": 1.0, "text": "264"}, {"category_id": 15, "poly": [127.0, 1593.0, 732.0, 1598.0, 732.0, 1632.0, 127.0, 1627.0], "score": 0.97, "text": "Bosch, J.M., 1979. Treatment effects on annual and dry period"}, {"category_id": 15, "poly": [157.0, 1625.0, 732.0, 1623.0, 732.0, 1653.0, 157.0, 1656.0], "score": 0.99, "text": " streamflow at Cathedral Peak. South African Forestry Journal 108,"}, {"category_id": 15, "poly": [161.0, 1651.0, 230.0, 1651.0, 230.0, 1681.0, 161.0, 1681.0], "score": 0.99, "text": "29-37."}, {"category_id": 15, "poly": [127.0, 1681.0, 732.0, 1681.0, 732.0, 1709.0, 127.0, 1709.0], "score": 0.97, "text": "Bosch, J.M., Von Gadow, K., 1990. Regulating afforestation for water"}, {"category_id": 15, "poly": [159.0, 1709.0, 730.0, 1707.0, 730.0, 1737.0, 159.0, 1739.0], "score": 0.98, "text": " conservation in South Africa. 
Suid-Afrikaanse Bosboutydskrif 153,"}, {"category_id": 15, "poly": [163.0, 1739.0, 228.0, 1739.0, 228.0, 1763.0, 163.0, 1763.0], "score": 1.0, "text": "41-54."}, {"category_id": 15, "poly": [127.0, 1763.0, 735.0, 1765.0, 734.0, 1795.0, 127.0, 1793.0], "score": 0.97, "text": "Chiew, F.H.S., McMahon, T.A., 1993. Assessing the adequacy of"}, {"category_id": 15, "poly": [161.0, 1793.0, 735.0, 1793.0, 735.0, 1821.0, 161.0, 1821.0], "score": 0.97, "text": "catchment streamflow yield estimates. Australian Journal of Soil"}, {"category_id": 15, "poly": [163.0, 1819.0, 365.0, 1819.0, 365.0, 1847.0, 163.0, 1847.0], "score": 1.0, "text": "Research 31, 665-680."}, {"category_id": 15, "poly": [791.0, 251.0, 1398.0, 256.0, 1398.0, 286.0, 790.0, 282.0], "score": 0.99, "text": "Cornish, P.M., Vertessy, R.A., 2001. Forest age-induced changes in"}, {"category_id": 15, "poly": [823.0, 284.0, 1398.0, 282.0, 1398.0, 312.0, 823.0, 314.0], "score": 0.98, "text": " evapotranspiration and water yield in a eucalypt forest. Journal of"}, {"category_id": 15, "poly": [825.0, 312.0, 1033.0, 312.0, 1033.0, 342.0, 825.0, 342.0], "score": 1.0, "text": "Hydrology 242, 43-63."}, {"category_id": 15, "poly": [788.0, 338.0, 1398.0, 340.0, 1398.0, 370.0, 788.0, 368.0], "score": 1.0, "text": "Fahey, B., Jackson, R., 1997. Hydrological impacts of converting"}, {"category_id": 15, "poly": [820.0, 366.0, 1398.0, 363.0, 1398.0, 400.0, 821.0, 402.0], "score": 0.97, "text": "native forests and grasslands to pine plantations, South"}, {"category_id": 15, "poly": [821.0, 393.0, 1396.0, 396.0, 1396.0, 428.0, 820.0, 426.0], "score": 0.98, "text": " Island, New Zealand. Agricultural and Forest Meteorology 84,"}, {"category_id": 15, "poly": [825.0, 424.0, 889.0, 424.0, 889.0, 454.0, 825.0, 454.0], "score": 1.0, "text": "69-82."}, {"category_id": 15, "poly": [788.0, 451.0, 1396.0, 454.0, 1396.0, 484.0, 788.0, 482.0], "score": 0.99, "text": "Hickel, K., 2001. 
The effect of pine afforestation on flow regime in"}, {"category_id": 15, "poly": [823.0, 479.0, 1398.0, 484.0, 1398.0, 512.0, 822.0, 507.0], "score": 0.97, "text": "small upland catchments. Masters Thesis, University of Stuttgart,"}, {"category_id": 15, "poly": [820.0, 510.0, 889.0, 505.0, 892.0, 537.0, 822.0, 543.0], "score": 0.94, "text": "p. 134."}, {"category_id": 15, "poly": [790.0, 540.0, 1396.0, 540.0, 1396.0, 568.0, 790.0, 568.0], "score": 1.0, "text": "Holmes, J.W., Sinclair, J.A., 1986. Water yield from some afforested"}, {"category_id": 15, "poly": [825.0, 570.0, 1398.0, 570.0, 1398.0, 598.0, 825.0, 598.0], "score": 0.98, "text": "catchments in Victoria. In Hydrology and Water Resources"}, {"category_id": 15, "poly": [825.0, 596.0, 1398.0, 596.0, 1398.0, 626.0, 825.0, 626.0], "score": 0.99, "text": "Symposium, Griffth University, Brisbane 25-27 November 1986,"}, {"category_id": 15, "poly": [820.0, 626.0, 939.0, 619.0, 941.0, 649.0, 822.0, 656.0], "score": 0.95, "text": "pp. 214-218."}, {"category_id": 15, "poly": [790.0, 654.0, 1398.0, 654.0, 1398.0, 682.0, 790.0, 682.0], "score": 0.99, "text": "Lane, P.N.J., Best, A.E., Hickel, K., Zhang, L., 2003. The effect"}, {"category_id": 15, "poly": [825.0, 682.0, 1398.0, 682.0, 1398.0, 710.0, 825.0, 710.0], "score": 0.99, "text": "of afforestation on flow duration curves. Cooperative Research"}, {"category_id": 15, "poly": [825.0, 710.0, 1396.0, 710.0, 1396.0, 740.0, 825.0, 740.0], "score": 0.97, "text": "Centre for Catchment Hydrology Technical Report O3/13,"}, {"category_id": 15, "poly": [820.0, 745.0, 884.0, 739.0, 886.0, 763.0, 822.0, 768.0], "score": 0.96, "text": "p.25."}, {"category_id": 15, "poly": [790.0, 768.0, 1396.0, 768.0, 1396.0, 798.0, 790.0, 798.0], "score": 0.98, "text": "Legates, D.R., McCabe, G.J., 1999. 
Evaluating the use of 'goodness-"}, {"category_id": 15, "poly": [825.0, 796.0, 1396.0, 796.0, 1396.0, 823.0, 825.0, 823.0], "score": 0.98, "text": "of-fit\u2019 measures in hydrologic and hydroclimatic model validation."}, {"category_id": 15, "poly": [825.0, 823.0, 1181.0, 823.0, 1181.0, 851.0, 825.0, 851.0], "score": 1.0, "text": "Water Resources Research 35, 233-241."}, {"category_id": 15, "poly": [790.0, 851.0, 1398.0, 851.0, 1398.0, 882.0, 790.0, 882.0], "score": 0.98, "text": "Lyne, V.D., Hollick, M., 1979. Stochastic time-varying rainfall-runoff"}, {"category_id": 15, "poly": [825.0, 882.0, 1398.0, 882.0, 1398.0, 912.0, 825.0, 912.0], "score": 1.0, "text": "modelling. Hydrology and Water Resources Symposium, Perth."}, {"category_id": 15, "poly": [825.0, 909.0, 1224.0, 909.0, 1224.0, 940.0, 825.0, 940.0], "score": 0.98, "text": "Institution of Engineers, Australia, pp. 89-92."}, {"category_id": 15, "poly": [788.0, 935.0, 1398.0, 937.0, 1398.0, 968.0, 788.0, 965.0], "score": 0.98, "text": "Nandakumar, N., Mein, R.G., 1993. Analysis of paired catchment data"}, {"category_id": 15, "poly": [825.0, 965.0, 1398.0, 965.0, 1398.0, 995.0, 825.0, 995.0], "score": 1.0, "text": "to determine the hydrologic effects of changes in vegetative cover"}, {"category_id": 15, "poly": [827.0, 995.0, 1396.0, 995.0, 1396.0, 1026.0, 827.0, 1026.0], "score": 0.99, "text": "on yield. Technical Report for Project UM010, Monash University"}, {"category_id": 15, "poly": [788.0, 1049.0, 1398.0, 1051.0, 1398.0, 1081.0, 788.0, 1079.0], "score": 0.98, "text": "Nash, J.E., Sutcliffe, J.V., 1970. River fow forecasting through"}, {"category_id": 15, "poly": [825.0, 1079.0, 1400.0, 1079.0, 1400.0, 1109.0, 825.0, 1109.0], "score": 0.97, "text": "conceptual models, I, A discussion of principals. 
Journal of"}, {"category_id": 15, "poly": [825.0, 1109.0, 1042.0, 1109.0, 1042.0, 1137.0, 825.0, 1137.0], "score": 1.0, "text": "Hydrology 10, 282-290."}, {"category_id": 15, "poly": [790.0, 1137.0, 1398.0, 1137.0, 1398.0, 1165.0, 790.0, 1165.0], "score": 0.98, "text": "Putahena, W.M., Cordery, I., 2000. Some hydrological effects of"}, {"category_id": 15, "poly": [827.0, 1165.0, 1398.0, 1165.0, 1398.0, 1195.0, 827.0, 1195.0], "score": 0.99, "text": "changing forest cover from eucalyptus to Pinus radiata. Agricul-"}, {"category_id": 15, "poly": [825.0, 1193.0, 1192.0, 1193.0, 1192.0, 1223.0, 825.0, 1223.0], "score": 0.99, "text": "tural and Forest Meteorology 100, 59-72."}, {"category_id": 15, "poly": [790.0, 1223.0, 1398.0, 1223.0, 1398.0, 1253.0, 790.0, 1253.0], "score": 0.99, "text": "Roberts, S., Vertessy, R.A., Grayson, R.G., 2001. Transpiration from"}, {"category_id": 15, "poly": [825.0, 1251.0, 1400.0, 1251.0, 1400.0, 1281.0, 825.0, 1281.0], "score": 0.99, "text": "Eucalyptus sieberi (L. Johnson) forests of different age. Forest "}, {"category_id": 15, "poly": [825.0, 1279.0, 1183.0, 1279.0, 1183.0, 1309.0, 825.0, 1309.0], "score": 1.0, "text": "Ecology and Management 143, 153-161."}, {"category_id": 15, "poly": [788.0, 1305.0, 1398.0, 1307.0, 1398.0, 1337.0, 788.0, 1335.0], "score": 0.99, "text": "Scott, D.F., Smith, R.E., 1997. Preliminary empirical models to predict"}, {"category_id": 15, "poly": [823.0, 1333.0, 1398.0, 1335.0, 1398.0, 1365.0, 823.0, 1363.0], "score": 0.98, "text": "reductions in total and low flows resulting from afforestation."}, {"category_id": 15, "poly": [825.0, 1363.0, 1046.0, 1363.0, 1046.0, 1393.0, 825.0, 1393.0], "score": 0.99, "text": "Water S.A. 
23, 135-140."}, {"category_id": 15, "poly": [790.0, 1393.0, 1398.0, 1393.0, 1398.0, 1421.0, 790.0, 1421.0], "score": 0.97, "text": "Scott, D.F., Prinsloo, F.W., Moses, G., Mehlomakulu, M.,"}, {"category_id": 15, "poly": [825.0, 1421.0, 1398.0, 1421.0, 1398.0, 1449.0, 825.0, 1449.0], "score": 0.97, "text": "Simmers, A.D.A., 2000. Area-analysis of the South African"}, {"category_id": 15, "poly": [825.0, 1449.0, 1398.0, 1449.0, 1398.0, 1479.0, 825.0, 1479.0], "score": 0.96, "text": "catchment afforestation experimental data. WRC Report"}, {"category_id": 15, "poly": [825.0, 1481.0, 954.0, 1481.0, 954.0, 1505.0, 825.0, 1505.0], "score": 0.98, "text": "No. 810/1/00."}, {"category_id": 15, "poly": [790.0, 1507.0, 1396.0, 1507.0, 1396.0, 1535.0, 790.0, 1535.0], "score": 0.98, "text": "Sikka, A.K., Samra, JS., Sharda, V.N., Samraj, P., Lakshmanan, V.,"}, {"category_id": 15, "poly": [825.0, 1535.0, 1400.0, 1535.0, 1400.0, 1565.0, 825.0, 1565.0], "score": 0.98, "text": "2003. Low fow and high responses to converting natural grassland"}, {"category_id": 15, "poly": [827.0, 1561.0, 1400.0, 1561.0, 1400.0, 1591.0, 827.0, 1591.0], "score": 0.99, "text": "into bluegum (Eucalyptus globulus) in Ningiris watersheds of"}, {"category_id": 15, "poly": [825.0, 1591.0, 1235.0, 1591.0, 1235.0, 1621.0, 825.0, 1621.0], "score": 0.99, "text": "South India. Journal of Hydrology 270, 12-26."}, {"category_id": 15, "poly": [790.0, 1621.0, 1398.0, 1621.0, 1398.0, 1651.0, 790.0, 1651.0], "score": 0.98, "text": " Smakhtin, V.U., 1999. A concept of pragmatic hydrological time series "}, {"category_id": 15, "poly": [825.0, 1649.0, 1398.0, 1649.0, 1398.0, 1679.0, 825.0, 1679.0], "score": 0.99, "text": "modelling and its application in South African context. 
In Ninth"}, {"category_id": 15, "poly": [823.0, 1675.0, 1398.0, 1677.0, 1398.0, 1707.0, 823.0, 1705.0], "score": 0.98, "text": " South African National Hydrology Symposium, 29-30 November"}, {"category_id": 15, "poly": [825.0, 1703.0, 971.0, 1703.0, 971.0, 1739.0, 825.0, 1739.0], "score": 0.99, "text": "1999, pp. 1-11."}, {"category_id": 15, "poly": [790.0, 1735.0, 1398.0, 1735.0, 1398.0, 1765.0, 790.0, 1765.0], "score": 0.98, "text": " Smakhtin, V.U., 2001. Low flow hydrology: a review. Journal of"}, {"category_id": 15, "poly": [825.0, 1763.0, 1052.0, 1763.0, 1052.0, 1793.0, 825.0, 1793.0], "score": 1.0, "text": "Hydrology 240, 147-186."}, {"category_id": 15, "poly": [793.0, 1791.0, 1398.0, 1791.0, 1398.0, 1821.0, 793.0, 1821.0], "score": 0.99, "text": "Van Lill, W.S., Kruger, F.J., Van Wyk, D.B., 1980. The effect of"}, {"category_id": 15, "poly": [827.0, 1819.0, 1398.0, 1819.0, 1398.0, 1849.0, 827.0, 1849.0], "score": 0.98, "text": "afforestation with Eucalyptus grandis Hill ex Maiden and Pinus"}, {"category_id": 15, "poly": [823.0, 1021.0, 1066.0, 1023.0, 1066.0, 1054.0, 822.0, 1051.0], "score": 0.98, "text": " Dept. 
of Civil Engineering,"}, {"category_id": 15, "poly": [126.0, 1528.0, 260.0, 1533.0, 259.0, 1572.0, 124.0, 1567.0], "score": 1.0, "text": "References"}], "page_info": {"page_no": 11, "height": 2064, "width": 1512}}, {"layout_dets": [{"category_id": 2, "poly": [465.48040771484375, 195.6739959716797, 1032.2401123046875, 195.6739959716797, 1032.2401123046875, 218.9838104248047, 465.48040771484375, 218.9838104248047], "score": 0.9999986886978149}, {"category_id": 1, "poly": [776.9209594726562, 255.59912109375, 1385.6478271484375, 255.59912109375, 1385.6478271484375, 614.4959716796875, 776.9209594726562, 614.4959716796875], "score": 0.9999933242797852}, {"category_id": 2, "poly": [1346.0157470703125, 195.03271484375, 1382.0159912109375, 195.03271484375, 1382.0159912109375, 217.2877960205078, 1346.0157470703125, 217.2877960205078], "score": 0.9999925494194031}, {"category_id": 1, "poly": [116.54571533203125, 257.5740966796875, 716.8768920898438, 257.5740966796875, 716.8768920898438, 615.0397338867188, 116.54571533203125, 615.0397338867188], "score": 0.9999920725822449}, {"category_id": 15, "poly": [466.0, 194.0, 1033.0, 194.0, 1033.0, 224.0, 466.0, 224.0], "score": 0.99, "text": "P.N.J. Lane et al. / Journal of Hydrology 310 (2005) 253-265"}, {"category_id": 15, "poly": [780.0, 254.0, 1383.0, 254.0, 1383.0, 284.0, 780.0, 284.0], "score": 0.99, "text": "Vogel, R.M., Fennessey, N.M., 1994. Flow duration curves. 1. New"}, {"category_id": 15, "poly": [810.0, 279.0, 1385.0, 282.0, 1385.0, 312.0, 810.0, 310.0], "score": 0.98, "text": " interpretation and confidence intervals. Journal of Water Planning"}, {"category_id": 15, "poly": [814.0, 310.0, 1128.0, 310.0, 1128.0, 340.0, 814.0, 340.0], "score": 0.99, "text": "and Management 120 (4), 485-504."}, {"category_id": 15, "poly": [780.0, 338.0, 1387.0, 338.0, 1387.0, 366.0, 780.0, 366.0], "score": 0.98, "text": "Whitehead, D., Beadle C.L., 2004. 
Physiological regulation of"}, {"category_id": 15, "poly": [808.0, 361.0, 1387.0, 363.0, 1387.0, 400.0, 807.0, 398.0], "score": 0.98, "text": " productivity and water use in Eucalyptus: a review. Forest Ecology"}, {"category_id": 15, "poly": [812.0, 393.0, 1104.0, 393.0, 1104.0, 424.0, 812.0, 424.0], "score": 1.0, "text": "and Management, 193, 113-140."}, {"category_id": 15, "poly": [777.0, 421.0, 1385.0, 421.0, 1385.0, 449.0, 777.0, 449.0], "score": 0.98, "text": "Zhang, L., Dawes, W.R., Walker, G.R., 1999. Predicting the effect of"}, {"category_id": 15, "poly": [814.0, 449.0, 1383.0, 449.0, 1383.0, 479.0, 814.0, 479.0], "score": 0.99, "text": "vegetation changes on catchment average water balance. Coop-"}, {"category_id": 15, "poly": [812.0, 475.0, 1385.0, 475.0, 1385.0, 505.0, 812.0, 505.0], "score": 0.99, "text": "erative Research Centre for Catchment Hydrology Technical"}, {"category_id": 15, "poly": [810.0, 503.0, 994.0, 503.0, 994.0, 533.0, 810.0, 533.0], "score": 0.99, "text": "Report 99/12, p. 35."}, {"category_id": 15, "poly": [777.0, 531.0, 1385.0, 531.0, 1385.0, 561.0, 777.0, 561.0], "score": 0.97, "text": "Zhang, L., Dawes, W.R., Walker, G.R., 2001. Response of mean"}, {"category_id": 15, "poly": [810.0, 557.0, 1385.0, 559.0, 1385.0, 589.0, 810.0, 587.0], "score": 0.98, "text": " annual evapotranspiration to vegetation changes at catchment"}, {"category_id": 15, "poly": [812.0, 587.0, 1222.0, 587.0, 1222.0, 615.0, 812.0, 615.0], "score": 1.0, "text": "scale. Water Resources Research 37, 701-708."}, {"category_id": 15, "poly": [1342.0, 189.0, 1387.0, 189.0, 1387.0, 234.0, 1342.0, 234.0], "score": 1.0, "text": "265"}, {"category_id": 15, "poly": [148.0, 254.0, 719.0, 254.0, 719.0, 284.0, 148.0, 284.0], "score": 1.0, "text": "patula Schlect. et Cham. on streamflow from experimental"}, {"category_id": 15, "poly": [146.0, 279.0, 720.0, 282.0, 719.0, 312.0, 146.0, 310.0], "score": 0.99, "text": "catchments at Mokubulaan, Transval. 
Journal of Hydrology 48,"}, {"category_id": 15, "poly": [150.0, 312.0, 234.0, 312.0, 234.0, 335.0, 150.0, 335.0], "score": 1.0, "text": "107-118."}, {"category_id": 15, "poly": [114.0, 338.0, 719.0, 338.0, 719.0, 366.0, 114.0, 366.0], "score": 0.97, "text": "Van Wyk, D.B., 1987. Some effects of afforestation on streamflow"}, {"category_id": 15, "poly": [144.0, 366.0, 719.0, 366.0, 719.0, 396.0, 144.0, 396.0], "score": 0.98, "text": "in the Western Cape Province, South Africa. Water S.A. 13,"}, {"category_id": 15, "poly": [148.0, 396.0, 210.0, 396.0, 210.0, 419.0, 148.0, 419.0], "score": 1.0, "text": "31-36."}, {"category_id": 15, "poly": [114.0, 421.0, 719.0, 421.0, 719.0, 452.0, 114.0, 452.0], "score": 0.98, "text": "Vertessy, R.A., Bessard, Y., 1999. Anticipating the negative"}, {"category_id": 15, "poly": [146.0, 449.0, 722.0, 449.0, 722.0, 479.0, 146.0, 479.0], "score": 0.98, "text": "hydrologic effects of plantation expansion: results from a"}, {"category_id": 15, "poly": [148.0, 475.0, 717.0, 475.0, 717.0, 503.0, 148.0, 503.0], "score": 0.98, "text": "GIS-based analysis on the Murrumbidgee Basin, in: Croke, J.,"}, {"category_id": 15, "poly": [146.0, 503.0, 722.0, 503.0, 722.0, 533.0, 146.0, 533.0], "score": 0.99, "text": "Lane, P.N.J. (Eds.), Forest Management for Water Quality and"}, {"category_id": 15, "poly": [144.0, 527.0, 722.0, 529.0, 722.0, 565.0, 144.0, 563.0], "score": 0.99, "text": "Quantity: Proceedings of the 2nd Erosion in Forests Meeting"}, {"category_id": 15, "poly": [146.0, 557.0, 722.0, 559.0, 722.0, 589.0, 146.0, 587.0], "score": 0.97, "text": " Cooperative Research Centre for Catchment Hydrology, Report "}, {"category_id": 15, "poly": [146.0, 587.0, 301.0, 587.0, 301.0, 617.0, 146.0, 617.0], "score": 0.93, "text": "99/6, Pp. 
69-73."}], "page_info": {"page_no": 12, "height": 2064, "width": 1512}}] \ No newline at end of file diff --git a/demo/demo1.pdf b/demo/demo1.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c9405d621e47da5bef2e4c685e8713095bbd4237 Binary files /dev/null and b/demo/demo1.pdf differ diff --git a/demo/demo2.json b/demo/demo2.json new file mode 100644 index 0000000000000000000000000000000000000000..d632d70fafe0b56404f1508f66c7835988ed6ff3 --- /dev/null +++ b/demo/demo2.json @@ -0,0 +1 @@ +[{"layout_dets": [{"category_id": 0, "poly": [282.1632080078125, 156.2249755859375, 1416.6795654296875, 156.2249755859375, 1416.6795654296875, 313.81280517578125, 282.1632080078125, 313.81280517578125], "score": 0.999998927116394}, {"category_id": 1, "poly": [861.656982421875, 522.7763061523438, 1569.3853759765625, 522.7763061523438, 1569.3853759765625, 656.883544921875, 861.656982421875, 656.883544921875], "score": 0.9999970197677612}, {"category_id": 1, "poly": [131.8020782470703, 924.7362670898438, 838.9530639648438, 924.7362670898438, 838.9530639648438, 1323.7529296875, 131.8020782470703, 1323.7529296875], "score": 0.9999949932098389}, {"category_id": 1, "poly": [133.32005310058594, 1324.5035400390625, 839.2289428710938, 1324.5035400390625, 839.2289428710938, 1589.4503173828125, 133.32005310058594, 1589.4503173828125], "score": 0.999994158744812}, {"category_id": 1, "poly": [863.3811645507812, 1486.610107421875, 1569.2880859375, 1486.610107421875, 1569.2880859375, 1852.443603515625, 863.3811645507812, 1852.443603515625], "score": 0.9999936819076538}, {"category_id": 1, "poly": [862.9096069335938, 1187.8067626953125, 1568.2279052734375, 1187.8067626953125, 1568.2279052734375, 1486.08935546875, 862.9096069335938, 1486.08935546875], "score": 0.9999932050704956}, {"category_id": 1, "poly": [131.8186492919922, 1652.7752685546875, 837.5543823242188, 1652.7752685546875, 837.5543823242188, 2019.429443359375, 131.8186492919922, 2019.429443359375], "score": 
0.9999901056289673}, {"category_id": 0, "poly": [375.1526794433594, 881.8807983398438, 594.3075561523438, 881.8807983398438, 594.3075561523438, 913.4786987304688, 375.1526794433594, 913.4786987304688], "score": 0.9999892115592957}, {"category_id": 2, "poly": [636.1867065429688, 2099.795654296875, 1063.7423095703125, 2099.795654296875, 1063.7423095703125, 2124.524169921875, 636.1867065429688, 2124.524169921875], "score": 0.9999860525131226}, {"category_id": 0, "poly": [375.91864013671875, 1610.209228515625, 592.8395385742188, 1610.209228515625, 592.8395385742188, 1641.5789794921875, 375.91864013671875, 1641.5789794921875], "score": 0.9999815821647644}, {"category_id": 4, "poly": [860.6583251953125, 995.6574096679688, 1569.622314453125, 995.6574096679688, 1569.622314453125, 1126.8409423828125, 860.6583251953125, 1126.8409423828125], "score": 0.9999815821647644}, {"category_id": 1, "poly": [443.1008605957031, 353.8008728027344, 1250.531494140625, 353.8008728027344, 1250.531494140625, 464.65576171875, 443.1008605957031, 464.65576171875], "score": 0.9999791979789734}, {"category_id": 1, "poly": [130.8282928466797, 523.2079467773438, 836.5639038085938, 523.2079467773438, 836.5639038085938, 862.0206909179688, 130.8282928466797, 862.0206909179688], "score": 0.9999784231185913}, {"category_id": 1, "poly": [862.6514282226562, 1851.426513671875, 1568.510498046875, 1851.426513671875, 1568.510498046875, 2017.93359375, 862.6514282226562, 2017.93359375], "score": 0.9999769926071167}, {"category_id": 3, "poly": [882.3795166015625, 685.376708984375, 1544.4088134765625, 685.376708984375, 1544.4088134765625, 969.22265625, 882.3795166015625, 969.22265625], "score": 0.9994785785675049}, {"category_id": 13, "poly": [1195, 1062, 1226, 1062, 1226, 1096, 1195, 1096], "score": 0.88, "latex": "d_{p}"}, {"category_id": 13, "poly": [1304, 1030, 1327, 1030, 1327, 1061, 1304, 1061], "score": 0.65, "latex": "\\bar{\\bf p}"}, {"category_id": 15, "poly": [344.0, 165.0, 1354.0, 172.0, 1353.0, 236.0, 
344.0, 229.0], "score": 0.99, "text": "Real-time Temporal Stereo Matching"}, {"category_id": 15, "poly": [293.0, 254.0, 1402.0, 254.0, 1402.0, 309.0, 293.0, 309.0], "score": 0.99, "text": "using Iterative Adaptive Support Weights"}, {"category_id": 15, "poly": [864.0, 527.0, 1568.0, 527.0, 1568.0, 559.0, 864.0, 559.0], "score": 0.99, "text": "disparity map. Note that individual disparities can be converted"}, {"category_id": 15, "poly": [864.0, 561.0, 1568.0, 561.0, 1568.0, 594.0, 864.0, 594.0], "score": 0.98, "text": "to actual depths if the geometry of the camera setup is"}, {"category_id": 15, "poly": [859.0, 587.0, 1568.0, 591.0, 1568.0, 630.0, 859.0, 626.0], "score": 0.98, "text": " known, i.e., the stereo configuration of cameras has been pre-"}, {"category_id": 15, "poly": [862.0, 626.0, 984.0, 626.0, 984.0, 658.0, 862.0, 658.0], "score": 1.0, "text": "calibrated."}, {"category_id": 15, "poly": [155.0, 921.0, 839.0, 924.0, 838.0, 963.0, 155.0, 960.0], "score": 0.98, "text": " Modern stereo matching algorithms achieve excellent results"}, {"category_id": 15, "poly": [127.0, 956.0, 838.0, 958.0, 838.0, 997.0, 127.0, 995.0], "score": 0.98, "text": " on static stereo images, as demonstrated by the Middlebury"}, {"category_id": 15, "poly": [132.0, 995.0, 836.0, 995.0, 836.0, 1027.0, 132.0, 1027.0], "score": 0.98, "text": "stereo performance benchmark [1], [2]. However, their ap-"}, {"category_id": 15, "poly": [134.0, 1027.0, 834.0, 1027.0, 834.0, 1059.0, 134.0, 1059.0], "score": 1.0, "text": "plication to stereo video sequences does not guarantee inter-"}, {"category_id": 15, "poly": [134.0, 1061.0, 836.0, 1061.0, 836.0, 1093.0, 134.0, 1093.0], "score": 0.99, "text": "frame consistency of matches extracted from subsequent stereo"}, {"category_id": 15, "poly": [132.0, 1095.0, 838.0, 1095.0, 838.0, 1125.0, 132.0, 1125.0], "score": 0.99, "text": "frame pairs. 
The lack of temporal consistency of matches"}, {"category_id": 15, "poly": [134.0, 1128.0, 836.0, 1128.0, 836.0, 1157.0, 134.0, 1157.0], "score": 1.0, "text": "between successive frames introduces spurious artifacts in the"}, {"category_id": 15, "poly": [132.0, 1160.0, 836.0, 1160.0, 836.0, 1192.0, 132.0, 1192.0], "score": 0.99, "text": "resulting disparity maps. The problem of obtaining temporally"}, {"category_id": 15, "poly": [132.0, 1194.0, 838.0, 1194.0, 838.0, 1226.0, 132.0, 1226.0], "score": 0.98, "text": "consistent sequences of disparity maps from video streams is"}, {"category_id": 15, "poly": [134.0, 1228.0, 838.0, 1228.0, 838.0, 1260.0, 134.0, 1260.0], "score": 0.98, "text": "known as the temporal stereo correspondence problem, yet"}, {"category_id": 15, "poly": [129.0, 1258.0, 841.0, 1260.0, 841.0, 1293.0, 129.0, 1290.0], "score": 0.98, "text": "the amount of research efforts oriented towards finding an"}, {"category_id": 15, "poly": [134.0, 1292.0, 760.0, 1292.0, 760.0, 1325.0, 134.0, 1325.0], "score": 0.99, "text": "effective solution to this problem is surprisingly small."}, {"category_id": 15, "poly": [157.0, 1320.0, 836.0, 1322.0, 836.0, 1361.0, 157.0, 1359.0], "score": 0.98, "text": " A method is proposed for real-time temporal stereo match-"}, {"category_id": 15, "poly": [134.0, 1361.0, 836.0, 1361.0, 836.0, 1393.0, 134.0, 1393.0], "score": 1.0, "text": "ing that efficiently propagates matching cost information be-"}, {"category_id": 15, "poly": [134.0, 1393.0, 836.0, 1393.0, 836.0, 1425.0, 134.0, 1425.0], "score": 0.99, "text": "tween consecutive frames of a stereo video sequence. 
This"}, {"category_id": 15, "poly": [132.0, 1423.0, 834.0, 1425.0, 834.0, 1458.0, 132.0, 1455.0], "score": 0.98, "text": "method is invariant to the number of prior frames being"}, {"category_id": 15, "poly": [134.0, 1458.0, 836.0, 1458.0, 836.0, 1490.0, 134.0, 1490.0], "score": 0.99, "text": "considered, and can be easily incorporated into any local stereo"}, {"category_id": 15, "poly": [132.0, 1492.0, 836.0, 1492.0, 836.0, 1524.0, 132.0, 1524.0], "score": 0.98, "text": "method based on edge-aware filters. The iterative adaptive"}, {"category_id": 15, "poly": [132.0, 1526.0, 838.0, 1526.0, 838.0, 1558.0, 132.0, 1558.0], "score": 0.99, "text": "support matching algorithm presented in [3] serves as a"}, {"category_id": 15, "poly": [132.0, 1558.0, 557.0, 1558.0, 557.0, 1590.0, 132.0, 1590.0], "score": 0.99, "text": "foundation for the proposed method."}, {"category_id": 15, "poly": [887.0, 1483.0, 1571.0, 1485.0, 1571.0, 1524.0, 887.0, 1522.0], "score": 0.98, "text": " In contrast, local methods, which are typically built upon"}, {"category_id": 15, "poly": [859.0, 1517.0, 1573.0, 1519.0, 1573.0, 1558.0, 859.0, 1556.0], "score": 0.97, "text": " the Winner-Takes-All (WTA) framework, have the property of "}, {"category_id": 15, "poly": [864.0, 1556.0, 1566.0, 1556.0, 1566.0, 1588.0, 864.0, 1588.0], "score": 0.99, "text": "computational regularity and are thus suitable for implemen-"}, {"category_id": 15, "poly": [862.0, 1588.0, 1566.0, 1588.0, 1566.0, 1620.0, 862.0, 1620.0], "score": 1.0, "text": "tation on parallel graphics hardware. 
Within the WTA frame-"}, {"category_id": 15, "poly": [862.0, 1616.0, 1568.0, 1618.0, 1568.0, 1657.0, 862.0, 1655.0], "score": 0.98, "text": "work, local stereo algorithms consider a range of disparity"}, {"category_id": 15, "poly": [864.0, 1655.0, 1566.0, 1655.0, 1566.0, 1687.0, 864.0, 1687.0], "score": 0.98, "text": "hypotheses and compute a volume of pixel-wise dissimilarity"}, {"category_id": 15, "poly": [862.0, 1689.0, 1571.0, 1689.0, 1571.0, 1721.0, 862.0, 1721.0], "score": 0.99, "text": "metrics between the reference image and the matched image at"}, {"category_id": 15, "poly": [862.0, 1723.0, 1568.0, 1721.0, 1568.0, 1753.0, 862.0, 1755.0], "score": 0.99, "text": "every considered disparity value. Final disparities are chosen"}, {"category_id": 15, "poly": [864.0, 1755.0, 1568.0, 1755.0, 1568.0, 1785.0, 864.0, 1785.0], "score": 1.0, "text": "from the cost volume by traversing through its values and"}, {"category_id": 15, "poly": [866.0, 1788.0, 1568.0, 1788.0, 1568.0, 1820.0, 866.0, 1820.0], "score": 0.99, "text": "selecting the disparities associated with minimum matching"}, {"category_id": 15, "poly": [859.0, 1817.0, 1377.0, 1820.0, 1377.0, 1859.0, 859.0, 1856.0], "score": 0.98, "text": " costs for every pixel of the reference image."}, {"category_id": 15, "poly": [885.0, 1187.0, 1571.0, 1187.0, 1571.0, 1226.0, 885.0, 1226.0], "score": 0.97, "text": " In their excellent taxonomy paper [1], Scharstein and"}, {"category_id": 15, "poly": [864.0, 1224.0, 1566.0, 1224.0, 1566.0, 1254.0, 864.0, 1254.0], "score": 0.99, "text": "Szeliski classify stereo algorithms as local or global meth-"}, {"category_id": 15, "poly": [859.0, 1249.0, 1571.0, 1254.0, 1570.0, 1293.0, 859.0, 1288.0], "score": 0.99, "text": " ods. 
Global methods, which offer outstanding accuracy, are"}, {"category_id": 15, "poly": [862.0, 1288.0, 1571.0, 1288.0, 1571.0, 1327.0, 862.0, 1327.0], "score": 0.98, "text": "typically derived from an energy minimization framework"}, {"category_id": 15, "poly": [859.0, 1322.0, 1566.0, 1322.0, 1566.0, 1352.0, 859.0, 1352.0], "score": 0.99, "text": "that allows for explicit integration of disparity smoothness"}, {"category_id": 15, "poly": [864.0, 1357.0, 1568.0, 1357.0, 1568.0, 1389.0, 864.0, 1389.0], "score": 0.99, "text": "constraints and thus is capable of regularizing the solution"}, {"category_id": 15, "poly": [864.0, 1391.0, 1568.0, 1391.0, 1568.0, 1421.0, 864.0, 1421.0], "score": 1.0, "text": "in weakly textured areas. The minimization, however, is often"}, {"category_id": 15, "poly": [864.0, 1423.0, 1568.0, 1423.0, 1568.0, 1455.0, 864.0, 1455.0], "score": 0.99, "text": "achieved using iterative methods or graph cuts, which do not"}, {"category_id": 15, "poly": [864.0, 1458.0, 1418.0, 1458.0, 1418.0, 1487.0, 864.0, 1487.0], "score": 0.99, "text": "lend themselves well to parallel implementation."}, {"category_id": 15, "poly": [155.0, 1650.0, 839.0, 1652.0, 838.0, 1691.0, 155.0, 1689.0], "score": 0.97, "text": " Stereo matching is the process of identifying correspon-"}, {"category_id": 15, "poly": [134.0, 1687.0, 838.0, 1687.0, 838.0, 1719.0, 134.0, 1719.0], "score": 0.99, "text": "dences between pixels in stereo images obtained using a"}, {"category_id": 15, "poly": [132.0, 1723.0, 838.0, 1721.0, 838.0, 1753.0, 132.0, 1755.0], "score": 0.98, "text": "pair of synchronized cameras. These correspondences are"}, {"category_id": 15, "poly": [134.0, 1755.0, 836.0, 1755.0, 836.0, 1788.0, 134.0, 1788.0], "score": 0.99, "text": "conveniently represented using the notion of disparity, i.e. the"}, {"category_id": 15, "poly": [134.0, 1788.0, 836.0, 1788.0, 836.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "positional offset between two matching pixels. 
It is assumed"}, {"category_id": 15, "poly": [134.0, 1822.0, 836.0, 1822.0, 836.0, 1854.0, 134.0, 1854.0], "score": 0.99, "text": "that the stereo images are rectified, such that matching pixels"}, {"category_id": 15, "poly": [132.0, 1854.0, 836.0, 1854.0, 836.0, 1886.0, 132.0, 1886.0], "score": 0.99, "text": "are confined within corresponding rows of the images and"}, {"category_id": 15, "poly": [134.0, 1888.0, 838.0, 1888.0, 838.0, 1918.0, 134.0, 1918.0], "score": 1.0, "text": "thus disparities are restricted to the horizontal dimension, as"}, {"category_id": 15, "poly": [134.0, 1920.0, 838.0, 1920.0, 838.0, 1952.0, 134.0, 1952.0], "score": 1.0, "text": "illustrated in Figure 1. For visualization purposes, disparities"}, {"category_id": 15, "poly": [134.0, 1955.0, 838.0, 1955.0, 838.0, 1987.0, 134.0, 1987.0], "score": 0.99, "text": "recovered for every pixel of a reference image are stored"}, {"category_id": 15, "poly": [129.0, 1985.0, 841.0, 1982.0, 841.0, 2021.0, 129.0, 2024.0], "score": 0.98, "text": "together in the form of an image, which is known as the"}, {"category_id": 15, "poly": [370.0, 885.0, 594.0, 885.0, 594.0, 917.0, 370.0, 917.0], "score": 1.0, "text": "1. INTRODUCTION"}, {"category_id": 15, "poly": [638.0, 2099.0, 1062.0, 2099.0, 1062.0, 2131.0, 638.0, 2131.0], "score": 0.98, "text": "978-1-4673-5208-6/13/$31.00 @2013 IEEE"}, {"category_id": 15, "poly": [374.0, 1613.0, 591.0, 1613.0, 591.0, 1645.0, 374.0, 1645.0], "score": 0.95, "text": "II. 
BACKGROUND"}, {"category_id": 15, "poly": [859.0, 992.0, 1571.0, 995.0, 1571.0, 1034.0, 859.0, 1031.0], "score": 0.99, "text": " Figure 1: Geometry of two horizontally aligned views where p"}, {"category_id": 15, "poly": [864.0, 1098.0, 1291.0, 1098.0, 1291.0, 1130.0, 864.0, 1130.0], "score": 0.99, "text": "them along the horizontal dimension."}, {"category_id": 15, "poly": [859.0, 1061.0, 1194.0, 1059.0, 1194.0, 1098.0, 859.0, 1100.0], "score": 0.98, "text": " pixel in the target frame, and"}, {"category_id": 15, "poly": [1227.0, 1061.0, 1571.0, 1059.0, 1571.0, 1098.0, 1227.0, 1100.0], "score": 0.97, "text": " denotes the disparity between"}, {"category_id": 15, "poly": [864.0, 1034.0, 1303.0, 1034.0, 1303.0, 1063.0, 864.0, 1063.0], "score": 0.99, "text": "denotes a pixel in the reference frame,"}, {"category_id": 15, "poly": [1328.0, 1034.0, 1566.0, 1034.0, 1566.0, 1063.0, 1328.0, 1063.0], "score": 0.96, "text": " denotes its matching"}, {"category_id": 15, "poly": [508.0, 357.0, 1194.0, 360.0, 1194.0, 392.0, 508.0, 390.0], "score": 0.98, "text": "Jedrzej Kowalczuk, Eric T. Psota, and Lance C. P\u00e9rez"}, {"category_id": 15, "poly": [443.0, 392.0, 1245.0, 392.0, 1245.0, 424.0, 443.0, 424.0], "score": 0.99, "text": "Department of Electrical Engineering, University of Nebraska-Lincoln"}, {"category_id": 15, "poly": [614.0, 435.0, 1081.0, 435.0, 1081.0, 465.0, 614.0, 465.0], "score": 0.99, "text": "[jkowalczuk2,epsota,lperez] @unl.edu"}, {"category_id": 15, "poly": [159.0, 527.0, 836.0, 527.0, 836.0, 559.0, 159.0, 559.0], "score": 0.98, "text": "Abstract-Stereo matching algorithms are nearly always de-"}, {"category_id": 15, "poly": [132.0, 555.0, 838.0, 555.0, 838.0, 587.0, 132.0, 587.0], "score": 0.98, "text": "signed to find matches between a single pair of images. 
A method"}, {"category_id": 15, "poly": [134.0, 580.0, 836.0, 580.0, 836.0, 612.0, 134.0, 612.0], "score": 1.0, "text": "is presented that was specifically designed to operate on sequences"}, {"category_id": 15, "poly": [132.0, 605.0, 838.0, 607.0, 838.0, 646.0, 132.0, 644.0], "score": 0.99, "text": "of images. This method considers the cost of matching image"}, {"category_id": 15, "poly": [132.0, 637.0, 838.0, 637.0, 838.0, 669.0, 132.0, 669.0], "score": 0.98, "text": "points in both the spatial and temporal domain. To maintain"}, {"category_id": 15, "poly": [134.0, 667.0, 838.0, 667.0, 838.0, 699.0, 134.0, 699.0], "score": 0.97, "text": "real-time operation, a temporal cost aggregation method is used"}, {"category_id": 15, "poly": [132.0, 692.0, 836.0, 692.0, 836.0, 722.0, 132.0, 722.0], "score": 0.98, "text": "to evaluate the likelihood of matches that is invariant with respect"}, {"category_id": 15, "poly": [127.0, 717.0, 841.0, 715.0, 841.0, 754.0, 127.0, 756.0], "score": 0.97, "text": "to the number of prior images being considered. 
This method"}, {"category_id": 15, "poly": [127.0, 742.0, 841.0, 745.0, 841.0, 784.0, 127.0, 781.0], "score": 0.98, "text": "has been implemented on massively parallel GPU hardware,"}, {"category_id": 15, "poly": [132.0, 777.0, 838.0, 777.0, 838.0, 809.0, 132.0, 809.0], "score": 0.99, "text": "and the implementation ranks as one of the fastest and most"}, {"category_id": 15, "poly": [132.0, 802.0, 838.0, 804.0, 838.0, 836.0, 132.0, 834.0], "score": 0.99, "text": "accurate real-time stereo matching methods as measured by the"}, {"category_id": 15, "poly": [134.0, 830.0, 619.0, 830.0, 619.0, 862.0, 134.0, 862.0], "score": 0.99, "text": "Middlebury stereo performance benchmark."}, {"category_id": 15, "poly": [887.0, 1849.0, 1568.0, 1852.0, 1568.0, 1891.0, 887.0, 1888.0], "score": 0.99, "text": " Disparity maps obtained using this simple strategy are often"}, {"category_id": 15, "poly": [862.0, 1888.0, 1568.0, 1888.0, 1568.0, 1920.0, 862.0, 1920.0], "score": 0.98, "text": "too noisy to be considered useable. 
To reduce the effects"}, {"category_id": 15, "poly": [864.0, 1923.0, 1568.0, 1923.0, 1568.0, 1952.0, 864.0, 1952.0], "score": 0.99, "text": "of noise and enforce spatial consistency of matches, local"}, {"category_id": 15, "poly": [862.0, 1948.0, 1568.0, 1950.0, 1568.0, 1989.0, 861.0, 1987.0], "score": 0.99, "text": "stereo algorithms consider arbitrarily shaped and sized support"}, {"category_id": 15, "poly": [864.0, 1989.0, 1568.0, 1989.0, 1568.0, 2021.0, 864.0, 2021.0], "score": 0.99, "text": "windows centered at each pixel of the reference image, and"}], "page_info": {"page_no": 0, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 8, "poly": [962.3624267578125, 1513.2073974609375, 1465.4017333984375, 1513.2073974609375, 1465.4017333984375, 1669.1397705078125, 962.3624267578125, 1669.1397705078125], "score": 0.9999995231628418}, {"category_id": 9, "poly": [1530.72998046875, 1101.879638671875, 1565.2568359375, 1101.879638671875, 1565.2568359375, 1130.8609619140625, 1530.72998046875, 1130.8609619140625], "score": 0.9999992251396179}, {"category_id": 9, "poly": [1529.8787841796875, 1575.843505859375, 1565.931396484375, 1575.843505859375, 1565.931396484375, 1607.2161865234375, 1529.8787841796875, 1607.2161865234375], "score": 0.9999987483024597}, {"category_id": 1, "poly": [865.1971435546875, 1684.040283203125, 1566.561279296875, 1684.040283203125, 1566.561279296875, 1813.7021484375, 865.1971435546875, 1813.7021484375], "score": 0.9999987483024597}, {"category_id": 9, "poly": [1530.5263671875, 1839.3990478515625, 1565.1201171875, 1839.3990478515625, 1565.1201171875, 1869.825439453125, 1530.5263671875, 1869.825439453125], "score": 0.9999977946281433}, {"category_id": 8, "poly": [972.3255004882812, 1075.85498046875, 1461.2088623046875, 1075.85498046875, 1461.2088623046875, 1155.465087890625, 972.3255004882812, 1155.465087890625], "score": 0.999996542930603}, {"category_id": 1, "poly": [865.4874267578125, 158.47100830078125, 1565.84375, 
158.47100830078125, 1565.84375, 355.3230285644531, 865.4874267578125, 355.3230285644531], "score": 0.9999960660934448}, {"category_id": 1, "poly": [133.51382446289062, 158.21670532226562, 835.5382080078125, 158.21670532226562, 835.5382080078125, 558.8020629882812, 133.51382446289062, 558.8020629882812], "score": 0.9999951124191284}, {"category_id": 1, "poly": [134.01239013671875, 954.4151000976562, 836.1470336914062, 954.4151000976562, 836.1470336914062, 1618.77197265625, 134.01239013671875, 1618.77197265625], "score": 0.9999947547912598}, {"category_id": 1, "poly": [134.4542999267578, 558.8201904296875, 834.2548828125, 558.8201904296875, 834.2548828125, 954.7811279296875, 134.4542999267578, 954.7811279296875], "score": 0.9999943971633911}, {"category_id": 1, "poly": [866.33642578125, 421.84442138671875, 1566.451904296875, 421.84442138671875, 1566.451904296875, 787.1864624023438, 866.33642578125, 787.1864624023438], "score": 0.9999930262565613}, {"category_id": 1, "poly": [864.974853515625, 1167.92236328125, 1567.0927734375, 1167.92236328125, 1567.0927734375, 1298.29541015625, 864.974853515625, 1298.29541015625], "score": 0.9999929666519165}, {"category_id": 1, "poly": [864.5220947265625, 853.943359375, 1565.82080078125, 853.943359375, 1565.82080078125, 1080.8125, 864.5220947265625, 1080.8125], "score": 0.9999923706054688}, {"category_id": 1, "poly": [865.4466552734375, 1919.30615234375, 1566.4720458984375, 1919.30615234375, 1566.4720458984375, 2017.154541015625, 865.4466552734375, 2017.154541015625], "score": 0.9999904036521912}, {"category_id": 1, "poly": [864.801513671875, 1302.438232421875, 1566.760986328125, 1302.438232421875, 1566.760986328125, 1498.9681396484375, 864.801513671875, 1498.9681396484375], "score": 0.9999889135360718}, {"category_id": 1, "poly": [133.34628295898438, 1620.0596923828125, 836.7553100585938, 1620.0596923828125, 836.7553100585938, 2018.44873046875, 133.34628295898438, 2018.44873046875], "score": 0.9999861717224121}, {"category_id": 0, 
"poly": [865.5296020507812, 809.8997802734375, 1302.7711181640625, 809.8997802734375, 1302.7711181640625, 841.3140869140625, 865.5296020507812, 841.3140869140625], "score": 0.9999798536300659}, {"category_id": 0, "poly": [1131.11181640625, 378.66229248046875, 1299.6181640625, 378.66229248046875, 1299.6181640625, 409.04852294921875, 1131.11181640625, 409.04852294921875], "score": 0.9999651908874512}, {"category_id": 8, "poly": [1003.5569458007812, 1824.2362060546875, 1420.7132568359375, 1824.2362060546875, 1420.7132568359375, 1905.175048828125, 1003.5569458007812, 1905.175048828125], "score": 0.999914288520813}, {"category_id": 14, "poly": [974, 1076, 1454, 1076, 1454, 1155, 974, 1155], "score": 0.94, "latex": "w(p,q)=\\exp{\\left(-\\frac{\\Delta_{g}(p,q)}{\\gamma_{g}}-\\frac{\\Delta_{c}(p,q)}{\\gamma_{c}}\\right)},"}, {"category_id": 14, "poly": [1006, 1825, 1423, 1825, 1423, 1907, 1006, 1907], "score": 0.94, "latex": "\\delta(q,\\bar{q})=\\sum_{c=\\{r,g,b\\}}\\operatorname*{min}(|q_{c}-\\bar{q}_{c}|,\\tau)."}, {"category_id": 14, "poly": [963, 1510, 1464, 1510, 1464, 1671, 963, 1671], "score": 0.93, "latex": "C(p,\\bar{p})=\\frac{\\displaystyle\\sum_{q\\in\\Omega_{p},\\bar{q}\\in\\Omega_{\\bar{p}}}w(p,q)w(\\bar{p},\\bar{q})\\delta(q,\\bar{q})}{\\displaystyle\\sum_{q\\in\\Omega_{p},\\bar{q}\\in\\Omega_{\\bar{p}}}w(p,q)w(\\bar{p},\\bar{q})}\\,,"}, {"category_id": 13, "poly": [1335, 1166, 1432, 1166, 1432, 1200, 1335, 1200], "score": 0.93, "latex": "\\Delta_{c}(p,q)"}, {"category_id": 13, "poly": [939, 1166, 1039, 1166, 1039, 1201, 939, 1201], "score": 0.93, "latex": "\\Delta_{g}(p,q)"}, {"category_id": 13, "poly": [1289, 1683, 1365, 1683, 1365, 1717, 1289, 1717], "score": 0.93, "latex": "\\delta(q,\\bar{q})"}, {"category_id": 13, "poly": [1362, 1367, 1441, 1367, 1441, 1401, 1362, 1401], "score": 0.92, "latex": "\\bar{p}\\in S_{p}"}, {"category_id": 13, "poly": [864, 1019, 951, 1019, 951, 1053, 864, 1053], "score": 0.92, "latex": "q\\in\\Omega_{p}"}, {"category_id": 
13, "poly": [1351, 953, 1388, 953, 1388, 987, 1351, 987], "score": 0.9, "latex": "\\Omega_{p}"}, {"category_id": 13, "poly": [913, 1467, 949, 1467, 949, 1501, 913, 1501], "score": 0.89, "latex": "\\Omega_{\\bar{p}}"}, {"category_id": 13, "poly": [1531, 1367, 1565, 1367, 1565, 1401, 1531, 1401], "score": 0.89, "latex": "S_{p}"}, {"category_id": 13, "poly": [1528, 1434, 1565, 1434, 1565, 1468, 1528, 1468], "score": 0.89, "latex": "\\Omega_{p}"}, {"category_id": 13, "poly": [1485, 1205, 1516, 1205, 1516, 1234, 1485, 1234], "score": 0.88, "latex": "\\gamma_{g}"}, {"category_id": 13, "poly": [1159, 1206, 1178, 1206, 1178, 1233, 1159, 1233], "score": 0.82, "latex": "p"}, {"category_id": 13, "poly": [863, 1238, 893, 1238, 893, 1266, 863, 1266], "score": 0.82, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [1177, 1436, 1196, 1436, 1196, 1465, 1177, 1465], "score": 0.8, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [1371, 1024, 1391, 1024, 1391, 1051, 1371, 1051], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [1540, 1406, 1558, 1406, 1558, 1432, 1540, 1432], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [1447, 1024, 1465, 1024, 1465, 1051, 1447, 1051], "score": 0.79, "latex": "q"}, {"category_id": 13, "poly": [1101, 1437, 1121, 1437, 1121, 1465, 1101, 1465], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [1389, 1307, 1407, 1307, 1407, 1332, 1389, 1332], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [1230, 1206, 1247, 1206, 1247, 1233, 1230, 1233], "score": 0.78, "latex": "q"}, {"category_id": 13, "poly": [1029, 1372, 1048, 1372, 1048, 1399, 1029, 1399], "score": 0.78, "latex": "p"}, {"category_id": 13, "poly": [916, 1752, 934, 1752, 934, 1782, 916, 1782], "score": 0.76, "latex": "\\bar{q}"}, {"category_id": 13, "poly": [1407, 1925, 1425, 1925, 1425, 1946, 1407, 1946], "score": 0.75, "latex": "\\tau"}, {"category_id": 13, "poly": [1548, 1722, 1565, 1722, 1565, 1749, 1548, 1749], "score": 0.75, "latex": "q"}, 
{"category_id": 13, "poly": [1050, 992, 1068, 992, 1068, 1018, 1050, 1018], "score": 0.75, "latex": "p"}, {"category_id": 15, "poly": [864.0, 1783.0, 1298.0, 1783.0, 1298.0, 1822.0, 864.0, 1822.0], "score": 0.99, "text": "green, and blue components given by"}, {"category_id": 15, "poly": [866.0, 1687.0, 1288.0, 1687.0, 1288.0, 1719.0, 866.0, 1719.0], "score": 0.96, "text": "where the pixel dissimilarity metric"}, {"category_id": 15, "poly": [1366.0, 1687.0, 1564.0, 1687.0, 1564.0, 1719.0, 1366.0, 1719.0], "score": 0.97, "text": "ischosen as the"}, {"category_id": 15, "poly": [866.0, 1751.0, 915.0, 1751.0, 915.0, 1783.0, 866.0, 1783.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [935.0, 1751.0, 1564.0, 1751.0, 1564.0, 1783.0, 935.0, 1783.0], "score": 0.98, "text": ". Here, the truncation of color difference for the red,"}, {"category_id": 15, "poly": [866.0, 1719.0, 1547.0, 1719.0, 1547.0, 1749.0, 866.0, 1749.0], "score": 0.99, "text": "sum of truncated absolute color differences between pixels"}, {"category_id": 15, "poly": [864.0, 163.0, 1568.0, 163.0, 1568.0, 192.0, 864.0, 192.0], "score": 1.0, "text": "temporal information, making it possible to process a temporal"}, {"category_id": 15, "poly": [859.0, 188.0, 1571.0, 193.0, 1570.0, 229.0, 859.0, 225.0], "score": 0.99, "text": " collection of cost volumes. 
The filtering operation was shown"}, {"category_id": 15, "poly": [864.0, 229.0, 1566.0, 229.0, 1566.0, 261.0, 864.0, 261.0], "score": 0.99, "text": "to preserve spatio-temporal edges present in the cost volumes,"}, {"category_id": 15, "poly": [859.0, 261.0, 1564.0, 264.0, 1564.0, 296.0, 859.0, 293.0], "score": 0.98, "text": " resulting in increased temporal consistency of disparity maps,"}, {"category_id": 15, "poly": [864.0, 296.0, 1566.0, 296.0, 1566.0, 328.0, 864.0, 328.0], "score": 0.99, "text": "greater robustness to image noise, and more accurate behavior"}, {"category_id": 15, "poly": [866.0, 328.0, 1160.0, 328.0, 1160.0, 360.0, 866.0, 360.0], "score": 1.0, "text": "around object boundaries."}, {"category_id": 15, "poly": [129.0, 158.0, 841.0, 153.0, 841.0, 192.0, 130.0, 197.0], "score": 0.99, "text": "aggregate cost values within the pixel neighborhoods defined"}, {"category_id": 15, "poly": [129.0, 188.0, 841.0, 190.0, 841.0, 229.0, 129.0, 227.0], "score": 0.99, "text": "by these windows. In 2005, Yoon and Kweon [4] proposed"}, {"category_id": 15, "poly": [132.0, 229.0, 838.0, 229.0, 838.0, 261.0, 132.0, 261.0], "score": 1.0, "text": "an adaptive matching cost aggregation scheme, which assigns"}, {"category_id": 15, "poly": [132.0, 261.0, 838.0, 261.0, 838.0, 293.0, 132.0, 293.0], "score": 0.98, "text": "a weight value to every pixel located in the support window"}, {"category_id": 15, "poly": [132.0, 293.0, 838.0, 293.0, 838.0, 325.0, 132.0, 325.0], "score": 0.98, "text": "of a given pixel of interest. 
The weight value is based on"}, {"category_id": 15, "poly": [132.0, 328.0, 836.0, 328.0, 836.0, 360.0, 132.0, 360.0], "score": 0.99, "text": "the spatial and color similarity between the pixel of interest"}, {"category_id": 15, "poly": [134.0, 360.0, 836.0, 360.0, 836.0, 392.0, 134.0, 392.0], "score": 1.0, "text": "and a pixel in its support window, and the aggregated cost is"}, {"category_id": 15, "poly": [134.0, 394.0, 836.0, 394.0, 836.0, 426.0, 134.0, 426.0], "score": 0.99, "text": "computed as a weighted average of the pixel-wise costs within"}, {"category_id": 15, "poly": [127.0, 422.0, 839.0, 424.0, 838.0, 463.0, 127.0, 461.0], "score": 0.98, "text": " the considered support window. The edge-preserving nature"}, {"category_id": 15, "poly": [129.0, 456.0, 838.0, 454.0, 838.0, 493.0, 129.0, 495.0], "score": 0.99, "text": " and matching accuracy of adaptive support weights have made"}, {"category_id": 15, "poly": [132.0, 490.0, 841.0, 490.0, 841.0, 529.0, 132.0, 529.0], "score": 0.99, "text": "them one of the most popular choices for cost aggregation in"}, {"category_id": 15, "poly": [132.0, 527.0, 797.0, 527.0, 797.0, 559.0, 132.0, 559.0], "score": 0.97, "text": "recently proposed stereo matching algorithms [3], [5]-[8]."}, {"category_id": 15, "poly": [157.0, 958.0, 836.0, 958.0, 836.0, 988.0, 157.0, 988.0], "score": 0.99, "text": "It has been demonstrated that the performance of stereo"}, {"category_id": 15, "poly": [132.0, 990.0, 838.0, 990.0, 838.0, 1022.0, 132.0, 1022.0], "score": 0.99, "text": "algorithms designed to match a single pair of images can"}, {"category_id": 15, "poly": [132.0, 1024.0, 836.0, 1024.0, 836.0, 1056.0, 132.0, 1056.0], "score": 0.99, "text": "be adapted to take advantage of the temporal dependencies"}, {"category_id": 15, "poly": [129.0, 1054.0, 838.0, 1054.0, 838.0, 1093.0, 129.0, 1093.0], "score": 0.97, "text": "available in stereo video sequences. 
Early proposed solutions"}, {"category_id": 15, "poly": [132.0, 1091.0, 836.0, 1091.0, 836.0, 1123.0, 132.0, 1123.0], "score": 0.99, "text": "to temporal stereo matching attempted to average matching"}, {"category_id": 15, "poly": [134.0, 1123.0, 836.0, 1123.0, 836.0, 1155.0, 134.0, 1155.0], "score": 0.99, "text": "costs across subsequent frames of a video sequence [13],"}, {"category_id": 15, "poly": [129.0, 1153.0, 841.0, 1150.0, 841.0, 1189.0, 129.0, 1192.0], "score": 0.98, "text": "[14]. Attempts have been made to integrate estimation of"}, {"category_id": 15, "poly": [134.0, 1192.0, 838.0, 1192.0, 838.0, 1224.0, 134.0, 1224.0], "score": 0.99, "text": "motion fields (optical flow) into temporal stereo matching. The"}, {"category_id": 15, "poly": [132.0, 1224.0, 838.0, 1224.0, 838.0, 1256.0, 132.0, 1256.0], "score": 0.99, "text": "methods of [15] and [16] perform smoothing of disparities"}, {"category_id": 15, "poly": [129.0, 1254.0, 841.0, 1254.0, 841.0, 1292.0, 129.0, 1292.0], "score": 0.99, "text": " along motion vectors recovered from the video sequence. The"}, {"category_id": 15, "poly": [132.0, 1290.0, 838.0, 1290.0, 838.0, 1322.0, 132.0, 1322.0], "score": 0.99, "text": "estimation of the motion field, however, prevents real-time"}, {"category_id": 15, "poly": [132.0, 1325.0, 838.0, 1325.0, 838.0, 1354.0, 132.0, 1354.0], "score": 0.99, "text": "implementation, since state-of-the-art optical flow algorithms"}, {"category_id": 15, "poly": [129.0, 1354.0, 841.0, 1354.0, 841.0, 1393.0, 129.0, 1393.0], "score": 0.99, "text": " do not, in general, approach real-time frame rates. 
In a related"}, {"category_id": 15, "poly": [129.0, 1386.0, 841.0, 1384.0, 841.0, 1423.0, 129.0, 1425.0], "score": 0.99, "text": "approach, Sizintsev and Wildes [17], [18] used steerable"}, {"category_id": 15, "poly": [134.0, 1423.0, 836.0, 1423.0, 836.0, 1455.0, 134.0, 1455.0], "score": 0.99, "text": "filters to obtain descriptors characterizing motion of image"}, {"category_id": 15, "poly": [134.0, 1455.0, 836.0, 1455.0, 836.0, 1487.0, 134.0, 1487.0], "score": 0.99, "text": "features in both space and time. Unlike traditional algorithms,"}, {"category_id": 15, "poly": [132.0, 1490.0, 838.0, 1490.0, 838.0, 1522.0, 132.0, 1522.0], "score": 0.98, "text": "their method performs matching on spatio-temporal motion"}, {"category_id": 15, "poly": [129.0, 1519.0, 841.0, 1517.0, 841.0, 1556.0, 129.0, 1558.0], "score": 0.99, "text": " descriptors, rather than on pure pixel intensity values, which"}, {"category_id": 15, "poly": [132.0, 1554.0, 841.0, 1554.0, 841.0, 1593.0, 132.0, 1593.0], "score": 0.99, "text": "leads to improved temporal coherence of disparity maps at the"}, {"category_id": 15, "poly": [132.0, 1586.0, 698.0, 1586.0, 698.0, 1618.0, 132.0, 1618.0], "score": 0.99, "text": "cost of reduced accuracy at depth discontinuities."}, {"category_id": 15, "poly": [159.0, 559.0, 838.0, 559.0, 838.0, 591.0, 159.0, 591.0], "score": 0.99, "text": "Recently, Rheman et al. [9], [10] have revisited the cost"}, {"category_id": 15, "poly": [132.0, 594.0, 838.0, 589.0, 839.0, 621.0, 132.0, 626.0], "score": 1.0, "text": "aggregation step of stereo algorithms, and demonstrated that"}, {"category_id": 15, "poly": [132.0, 626.0, 838.0, 626.0, 838.0, 658.0, 132.0, 658.0], "score": 0.99, "text": "cost aggregation can be performed by filtering of subsequent"}, {"category_id": 15, "poly": [134.0, 660.0, 834.0, 660.0, 834.0, 692.0, 134.0, 692.0], "score": 1.0, "text": "layers of the initially computed matching cost volume. 
In par-"}, {"category_id": 15, "poly": [132.0, 692.0, 836.0, 692.0, 836.0, 724.0, 132.0, 724.0], "score": 0.99, "text": "ticular, the edge-aware image filters, such as the bilateral filter"}, {"category_id": 15, "poly": [127.0, 719.0, 839.0, 724.0, 838.0, 761.0, 127.0, 756.0], "score": 0.99, "text": " of Tomasi and Manducci [11] or the guided filter of He [12],"}, {"category_id": 15, "poly": [132.0, 759.0, 838.0, 759.0, 838.0, 791.0, 132.0, 791.0], "score": 0.98, "text": "have been rendered useful for the problem of matching cost"}, {"category_id": 15, "poly": [132.0, 793.0, 838.0, 791.0, 838.0, 823.0, 132.0, 825.0], "score": 0.99, "text": "aggregation, enabling stereo algorithms to correctly recover"}, {"category_id": 15, "poly": [134.0, 825.0, 838.0, 825.0, 838.0, 857.0, 134.0, 857.0], "score": 0.98, "text": "disparities along object boundaries. In fact, Yoon and Kweon's"}, {"category_id": 15, "poly": [134.0, 859.0, 838.0, 859.0, 838.0, 891.0, 134.0, 891.0], "score": 1.0, "text": "adaptive support-weight cost aggregation scheme is equivalent"}, {"category_id": 15, "poly": [132.0, 891.0, 838.0, 891.0, 838.0, 924.0, 132.0, 924.0], "score": 0.98, "text": "to the application of the so-called joint bilateral filter to the"}, {"category_id": 15, "poly": [134.0, 924.0, 547.0, 924.0, 547.0, 956.0, 134.0, 956.0], "score": 1.0, "text": "layers of the matching cost volume."}, {"category_id": 15, "poly": [889.0, 422.0, 1568.0, 424.0, 1568.0, 456.0, 889.0, 454.0], "score": 0.98, "text": "The proposed temporal stereo matching algorithm is an"}, {"category_id": 15, "poly": [862.0, 456.0, 1571.0, 456.0, 1571.0, 495.0, 862.0, 495.0], "score": 1.0, "text": "extension of the real-time iterative adaptive support-weight"}, {"category_id": 15, "poly": [864.0, 490.0, 1568.0, 490.0, 1568.0, 522.0, 864.0, 522.0], "score": 0.99, "text": "algorithm described in [3]. 
In addition to real-time two-"}, {"category_id": 15, "poly": [864.0, 525.0, 1566.0, 525.0, 1566.0, 557.0, 864.0, 557.0], "score": 1.0, "text": "pass aggregation of the cost values in the spatial domain,"}, {"category_id": 15, "poly": [864.0, 557.0, 1568.0, 557.0, 1568.0, 589.0, 864.0, 589.0], "score": 0.99, "text": "the proposed algorithm enhances stereo matching on video"}, {"category_id": 15, "poly": [866.0, 594.0, 1566.0, 594.0, 1566.0, 626.0, 866.0, 626.0], "score": 0.97, "text": "sequences by aggregating costs along the time dimension."}, {"category_id": 15, "poly": [864.0, 626.0, 1568.0, 626.0, 1568.0, 658.0, 864.0, 658.0], "score": 1.0, "text": "The operation of the algorithm has been divided into four"}, {"category_id": 15, "poly": [866.0, 660.0, 1568.0, 660.0, 1568.0, 692.0, 866.0, 692.0], "score": 0.99, "text": "stages: 1) two-pass spatial cost aggregation, 2) temporal cost"}, {"category_id": 15, "poly": [862.0, 688.0, 1568.0, 685.0, 1568.0, 724.0, 862.0, 727.0], "score": 1.0, "text": "aggregation, 3) disparity selection and confidence assessment,"}, {"category_id": 15, "poly": [866.0, 724.0, 1568.0, 724.0, 1568.0, 756.0, 866.0, 756.0], "score": 1.0, "text": "and 4) iterative disparity refinement. 
In the following, each of"}, {"category_id": 15, "poly": [864.0, 759.0, 1254.0, 759.0, 1254.0, 791.0, 864.0, 791.0], "score": 1.0, "text": "these stages is described in detail."}, {"category_id": 15, "poly": [860.0, 1265.0, 1194.0, 1270.0, 1194.0, 1306.0, 859.0, 1301.0], "score": 0.99, "text": " color similarity, respectively."}, {"category_id": 15, "poly": [1433.0, 1169.0, 1566.0, 1169.0, 1566.0, 1201.0, 1433.0, 1201.0], "score": 0.98, "text": "is the color"}, {"category_id": 15, "poly": [864.0, 1169.0, 938.0, 1169.0, 938.0, 1201.0, 864.0, 1201.0], "score": 1.0, "text": "where"}, {"category_id": 15, "poly": [1040.0, 1169.0, 1334.0, 1169.0, 1334.0, 1201.0, 1040.0, 1201.0], "score": 0.98, "text": "is the geometric distance,"}, {"category_id": 15, "poly": [1517.0, 1196.0, 1566.0, 1201.0, 1566.0, 1240.0, 1517.0, 1235.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [862.0, 1196.0, 1158.0, 1201.0, 1158.0, 1240.0, 861.0, 1235.0], "score": 1.0, "text": "difference between pixels"}, {"category_id": 15, "poly": [894.0, 1233.0, 1566.0, 1231.0, 1566.0, 1270.0, 894.0, 1272.0], "score": 0.97, "text": "regulate the strength of grouping by geometric distance and"}, {"category_id": 15, "poly": [1179.0, 1196.0, 1229.0, 1201.0, 1229.0, 1240.0, 1179.0, 1235.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1248.0, 1196.0, 1484.0, 1201.0, 1484.0, 1240.0, 1248.0, 1235.0], "score": 0.99, "text": ", and the coefficients"}, {"category_id": 15, "poly": [887.0, 848.0, 1568.0, 850.0, 1568.0, 889.0, 887.0, 887.0], "score": 0.99, "text": " Humans group shapes by observing the geometric distance"}, {"category_id": 15, "poly": [859.0, 885.0, 1568.0, 882.0, 1568.0, 921.0, 859.0, 924.0], "score": 0.98, "text": " and color similarity of points in space. 
To mimic this vi-"}, {"category_id": 15, "poly": [864.0, 921.0, 1568.0, 921.0, 1568.0, 953.0, 864.0, 953.0], "score": 0.99, "text": "sual grouping, the adaptive support-weight stereo matching"}, {"category_id": 15, "poly": [864.0, 1054.0, 899.0, 1054.0, 899.0, 1084.0, 864.0, 1084.0], "score": 1.0, "text": "by"}, {"category_id": 15, "poly": [866.0, 956.0, 1350.0, 956.0, 1350.0, 988.0, 866.0, 988.0], "score": 0.98, "text": "algorithm [4] considers a support window"}, {"category_id": 15, "poly": [1389.0, 956.0, 1566.0, 956.0, 1566.0, 988.0, 1389.0, 988.0], "score": 0.98, "text": " centered at the"}, {"category_id": 15, "poly": [952.0, 1022.0, 1370.0, 1022.0, 1370.0, 1054.0, 952.0, 1054.0], "score": 0.98, "text": ". The support weight relating pixels"}, {"category_id": 15, "poly": [1392.0, 1022.0, 1446.0, 1022.0, 1446.0, 1054.0, 1392.0, 1054.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [1466.0, 1022.0, 1566.0, 1022.0, 1566.0, 1054.0, 1466.0, 1054.0], "score": 0.98, "text": "is given"}, {"category_id": 15, "poly": [866.0, 990.0, 1049.0, 990.0, 1049.0, 1022.0, 866.0, 1022.0], "score": 1.0, "text": "pixel of interest"}, {"category_id": 15, "poly": [1069.0, 990.0, 1566.0, 990.0, 1566.0, 1022.0, 1069.0, 1022.0], "score": 1.0, "text": ", and assigns a support weight to each pixel"}, {"category_id": 15, "poly": [862.0, 1948.0, 1568.0, 1950.0, 1568.0, 1989.0, 861.0, 1987.0], "score": 0.98, "text": "vides additional robustness to outliers. 
Rather than evaluating"}, {"category_id": 15, "poly": [864.0, 1989.0, 1566.0, 1989.0, 1566.0, 2021.0, 864.0, 2021.0], "score": 0.98, "text": "Equation (2) directly, real-time algorithms often approximate"}, {"category_id": 15, "poly": [862.0, 1920.0, 1406.0, 1920.0, 1406.0, 1952.0, 862.0, 1952.0], "score": 0.99, "text": "This limits each of their magnitudes to at most"}, {"category_id": 15, "poly": [1426.0, 1920.0, 1561.0, 1920.0, 1561.0, 1952.0, 1426.0, 1952.0], "score": 0.96, "text": ",whichpro-"}, {"category_id": 15, "poly": [859.0, 1331.0, 1571.0, 1334.0, 1571.0, 1373.0, 859.0, 1370.0], "score": 0.98, "text": " iterative adaptive support-weight algorithm evaluates matching"}, {"category_id": 15, "poly": [859.0, 1464.0, 912.0, 1467.0, 912.0, 1506.0, 859.0, 1503.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [950.0, 1464.0, 1474.0, 1467.0, 1474.0, 1506.0, 950.0, 1503.0], "score": 1.0, "text": ", the initial matching cost is aggregated using"}, {"category_id": 15, "poly": [1442.0, 1370.0, 1530.0, 1370.0, 1530.0, 1402.0, 1442.0, 1402.0], "score": 0.98, "text": ", where"}, {"category_id": 15, "poly": [1197.0, 1437.0, 1527.0, 1437.0, 1527.0, 1469.0, 1197.0, 1469.0], "score": 0.97, "text": ", and their support windows"}, {"category_id": 15, "poly": [866.0, 1402.0, 1539.0, 1402.0, 1539.0, 1435.0, 866.0, 1435.0], "score": 1.0, "text": "denotes a set of matching candidates associated with pixel"}, {"category_id": 15, "poly": [864.0, 1437.0, 1100.0, 1437.0, 1100.0, 1469.0, 864.0, 1469.0], "score": 0.97, "text": "For a pair of pixels"}, {"category_id": 15, "poly": [1122.0, 1437.0, 1176.0, 1437.0, 1176.0, 1469.0, 1122.0, 1469.0], "score": 0.94, "text": " and"}, {"category_id": 15, "poly": [887.0, 1299.0, 1388.0, 1304.0, 1388.0, 1336.0, 887.0, 1331.0], "score": 0.96, "text": " To identify a match for the pixel of interest"}, {"category_id": 15, "poly": [1408.0, 1299.0, 1568.0, 1304.0, 1568.0, 1336.0, 1408.0, 1331.0], "score": 1.0, "text": ", the real-time"}, 
{"category_id": 15, "poly": [864.0, 1370.0, 1028.0, 1370.0, 1028.0, 1402.0, 864.0, 1402.0], "score": 1.0, "text": "costs between"}, {"category_id": 15, "poly": [1049.0, 1370.0, 1361.0, 1370.0, 1361.0, 1402.0, 1049.0, 1402.0], "score": 0.99, "text": " and every match candidate"}, {"category_id": 15, "poly": [160.0, 1618.0, 836.0, 1623.0, 836.0, 1655.0, 159.0, 1650.0], "score": 0.99, "text": "Most recently, local stereo algorithms based on edge-aware"}, {"category_id": 15, "poly": [127.0, 1650.0, 841.0, 1652.0, 841.0, 1691.0, 127.0, 1689.0], "score": 0.97, "text": " filters were extended to incorporate temporal evidence into"}, {"category_id": 15, "poly": [132.0, 1687.0, 836.0, 1687.0, 836.0, 1719.0, 132.0, 1719.0], "score": 0.97, "text": "the matching process. The method of Richardt et al. [19]"}, {"category_id": 15, "poly": [134.0, 1723.0, 838.0, 1723.0, 838.0, 1753.0, 134.0, 1753.0], "score": 0.99, "text": "employs a variant of the bilateral grid [20] implemented on"}, {"category_id": 15, "poly": [134.0, 1755.0, 838.0, 1755.0, 838.0, 1788.0, 134.0, 1788.0], "score": 0.99, "text": "graphics hardware, which accelerates cost aggregation and"}, {"category_id": 15, "poly": [134.0, 1788.0, 838.0, 1788.0, 838.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "allows for weighted propagation of pixel dissimilarity metrics"}, {"category_id": 15, "poly": [132.0, 1822.0, 838.0, 1822.0, 838.0, 1854.0, 132.0, 1854.0], "score": 0.99, "text": "from previous frames to the current one. 
Although this method"}, {"category_id": 15, "poly": [129.0, 1856.0, 838.0, 1856.0, 838.0, 1888.0, 129.0, 1888.0], "score": 1.0, "text": " outperforms the baseline frame-to-frame approach, the amount"}, {"category_id": 15, "poly": [132.0, 1888.0, 838.0, 1888.0, 838.0, 1920.0, 132.0, 1920.0], "score": 0.97, "text": "of hardware memory necessary to construct the bilateral grid"}, {"category_id": 15, "poly": [127.0, 1916.0, 841.0, 1918.0, 841.0, 1957.0, 127.0, 1955.0], "score": 0.99, "text": "limits its application to single-channel, i.e., grayscale images "}, {"category_id": 15, "poly": [132.0, 1955.0, 838.0, 1955.0, 838.0, 1985.0, 132.0, 1985.0], "score": 0.99, "text": "only. Hosni et al. [10], on the other hand, reformulated kernels"}, {"category_id": 15, "poly": [132.0, 1989.0, 838.0, 1989.0, 838.0, 2021.0, 132.0, 2021.0], "score": 0.99, "text": "of the guided image filter to operate on both spatial and"}, {"category_id": 15, "poly": [859.0, 809.0, 1307.0, 809.0, 1307.0, 848.0, 859.0, 848.0], "score": 0.99, "text": "A. Two-Pass Spatial Cost Aggregation"}, {"category_id": 15, "poly": [1129.0, 376.0, 1300.0, 376.0, 1300.0, 417.0, 1129.0, 417.0], "score": 0.94, "text": "III. 
METHOD"}], "page_info": {"page_no": 1, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [865.5088500976562, 856.5537109375, 1567.692626953125, 856.5537109375, 1567.692626953125, 1420.9698486328125, 865.5088500976562, 1420.9698486328125], "score": 0.9999963045120239}, {"category_id": 8, "poly": [281.1294860839844, 1001.0513916015625, 689.37451171875, 1001.0513916015625, 689.37451171875, 1075.8765869140625, 281.1294860839844, 1075.8765869140625], "score": 0.9999961256980896}, {"category_id": 1, "poly": [133.53353881835938, 158.6427459716797, 836.7297973632812, 158.6427459716797, 836.7297973632812, 390.48828125, 133.53353881835938, 390.48828125], "score": 0.9999960660934448}, {"category_id": 8, "poly": [145.77777099609375, 1839.6416015625, 803.4192504882812, 1839.6416015625, 803.4192504882812, 1993.239013671875, 145.77777099609375, 1993.239013671875], "score": 0.9999958872795105}, {"category_id": 1, "poly": [864.9884643554688, 1420.8831787109375, 1567.3118896484375, 1420.8831787109375, 1567.3118896484375, 2023.257080078125, 864.9884643554688, 2023.257080078125], "score": 0.9999951124191284}, {"category_id": 9, "poly": [1529.267333984375, 388.6717834472656, 1565.1744384765625, 388.6717834472656, 1565.1744384765625, 416.4899597167969, 1529.267333984375, 416.4899597167969], "score": 0.9999918937683105}, {"category_id": 9, "poly": [800.3933715820312, 1551.524169921875, 833.2618408203125, 1551.524169921875, 833.2618408203125, 1582.073486328125, 800.3933715820312, 1582.073486328125], "score": 0.9999911189079285}, {"category_id": 1, "poly": [864.3720092773438, 200.97483825683594, 1565.6871337890625, 200.97483825683594, 1565.6871337890625, 365.6230163574219, 864.3720092773438, 365.6230163574219], "score": 0.9999903440475464}, {"category_id": 1, "poly": [134.87628173828125, 1369.5762939453125, 835.0336303710938, 1369.5762939453125, 835.0336303710938, 1533.884765625, 134.87628173828125, 1533.884765625], "score": 0.9999880790710449}, {"category_id": 
1, "poly": [134.59988403320312, 444.5299377441406, 836.5606079101562, 444.5299377441406, 836.5606079101562, 709.0791015625, 134.59988403320312, 709.0791015625], "score": 0.999987006187439}, {"category_id": 1, "poly": [134.15472412109375, 1084.4288330078125, 836.2360229492188, 1084.4288330078125, 836.2360229492188, 1314.6600341796875, 134.15472412109375, 1314.6600341796875], "score": 0.9999866485595703}, {"category_id": 9, "poly": [800.6007690429688, 1023.1047973632812, 833.2154541015625, 1023.1047973632812, 833.2154541015625, 1055.7227783203125, 800.6007690429688, 1055.7227783203125], "score": 0.9999839663505554}, {"category_id": 8, "poly": [948.4016723632812, 372.03607177734375, 1486.11279296875, 372.03607177734375, 1486.11279296875, 449.3696594238281, 948.4016723632812, 449.3696594238281], "score": 0.9999831914901733}, {"category_id": 8, "poly": [145.31065368652344, 714.4036254882812, 820.3599853515625, 714.4036254882812, 820.3599853515625, 791.855712890625, 145.31065368652344, 791.855712890625], "score": 0.9999772906303406}, {"category_id": 1, "poly": [863.8760986328125, 599.6033325195312, 1566.84619140625, 599.6033325195312, 1566.84619140625, 797.44189453125, 863.8760986328125, 797.44189453125], "score": 0.999976396560669}, {"category_id": 1, "poly": [864.925537109375, 464.9669189453125, 1565.212158203125, 464.9669189453125, 1565.212158203125, 529.045654296875, 864.925537109375, 529.045654296875], "score": 0.999973475933075}, {"category_id": 1, "poly": [133.88735961914062, 797.7457885742188, 835.5986328125, 797.7457885742188, 835.5986328125, 994.4456176757812, 133.88735961914062, 994.4456176757812], "score": 0.9999661445617676}, {"category_id": 1, "poly": [134.8787841796875, 1615.116455078125, 835.4554443359375, 1615.116455078125, 835.4554443359375, 1815.4564208984375, 134.8787841796875, 1815.4564208984375], "score": 0.9999580383300781}, {"category_id": 9, "poly": [1530.1783447265625, 550.1576538085938, 1564.607177734375, 550.1576538085938, 1564.607177734375, 
578.6950073242188, 1530.1783447265625, 578.6950073242188], "score": 0.9999532103538513}, {"category_id": 9, "poly": [801.0740966796875, 738.4259643554688, 834.7449340820312, 738.4259643554688, 834.7449340820312, 770.4969482421875, 801.0740966796875, 770.4969482421875], "score": 0.9996598958969116}, {"category_id": 0, "poly": [1134.302490234375, 815.6021728515625, 1295.3885498046875, 815.6021728515625, 1295.3885498046875, 844.6544799804688, 1134.302490234375, 844.6544799804688], "score": 0.9994980096817017}, {"category_id": 9, "poly": [798.6090698242188, 1986.7332763671875, 834.5460205078125, 1986.7332763671875, 834.5460205078125, 2017.6595458984375, 798.6090698242188, 2017.6595458984375], "score": 0.9992558360099792}, {"category_id": 0, "poly": [135.0093994140625, 406.12335205078125, 475.6328125, 406.12335205078125, 475.6328125, 437.4545593261719, 135.0093994140625, 437.4545593261719], "score": 0.9990860819816589}, {"category_id": 8, "poly": [1029.3924560546875, 541.857177734375, 1400.174072265625, 541.857177734375, 1400.174072265625, 585.1640625, 1029.3924560546875, 585.1640625], "score": 0.9979717135429382}, {"category_id": 0, "poly": [133.26077270507812, 1330.139892578125, 713.5426635742188, 1330.139892578125, 713.5426635742188, 1363.1341552734375, 133.26077270507812, 1363.1341552734375], "score": 0.9967154860496521}, {"category_id": 8, "poly": [338.6681823730469, 1547.7218017578125, 626.6519775390625, 1547.7218017578125, 626.6519775390625, 1604.587646484375, 338.6681823730469, 1604.587646484375], "score": 0.9945433139801025}, {"category_id": 1, "poly": [864.5469970703125, 160.16702270507812, 1251.313720703125, 160.16702270507812, 1251.313720703125, 190.15760803222656, 864.5469970703125, 190.15760803222656], "score": 0.9902143478393555}, {"category_id": 13, "poly": [550, 577, 648, 577, 648, 612, 550, 612], "score": 0.95, "latex": "C_{a}(p,\\bar{p})"}, {"category_id": 13, "poly": [183, 1780, 304, 1780, 304, 1813, 183, 1813], "score": 0.95, "latex": 
"p^{\\prime}=m(\\bar{p})"}, {"category_id": 14, "poly": [279, 1000, 687, 1000, 687, 1078, 279, 1078], "score": 0.95, "latex": "w_{t}(p,p_{t-1})=\\exp\\bigg({-\\frac{\\Delta_{c}(p,p_{t-1})}{\\gamma_{t}}}\\bigg),"}, {"category_id": 14, "poly": [147, 1843, 820, 1843, 820, 1992, 147, 1992], "score": 0.94, "latex": "F_{p}=\\left\\{\\begin{array}{l l}{\\underset{\\bar{p}\\in S_{p}\\setminus m(p)}{\\mathrm{min}}\\,C(p,\\bar{p})-\\underset{\\bar{p}\\in S_{p}}{\\mathrm{min}}\\,C(p,\\bar{p})}\\\\ {\\underset{\\bar{p}\\in S_{p}\\setminus m(p)}{\\mathrm{min}}\\,C(p,\\bar{p})}&{|d_{p}-d_{p^{\\prime}}|\\leq1}\\\\ {0,}&{\\mathrm{otherwise}}\\end{array}\\right.."}, {"category_id": 14, "poly": [340, 1546, 628, 1546, 628, 1608, 340, 1608], "score": 0.93, "latex": "m(p)=\\underset{\\bar{p}\\in S_{p}}{\\mathrm{argmin}}\\,C(p,\\bar{p})\\,."}, {"category_id": 13, "poly": [321, 830, 443, 830, 443, 864, 321, 864], "score": 0.93, "latex": "w_{t}(p,p_{t-1})"}, {"category_id": 13, "poly": [581, 1713, 694, 1713, 694, 1747, 581, 1747], "score": 0.93, "latex": "{\\bar{p}}=m(p)"}, {"category_id": 14, "poly": [947, 373, 1478, 373, 1478, 454, 947, 454], "score": 0.93, "latex": "\\Lambda^{i}(p,\\bar{p})=\\alpha\\times\\sum_{q\\in\\Omega_{p}}w(p,q)F_{q}^{i-1}\\left|D_{q}^{i-1}-d_{p}\\right|\\,,"}, {"category_id": 13, "poly": [426, 445, 512, 445, 512, 479, 426, 479], "score": 0.93, "latex": "C(p,{\\bar{p}})"}, {"category_id": 13, "poly": [337, 356, 414, 356, 414, 391, 337, 391], "score": 0.93, "latex": "\\mathcal{O}(\\omega^{2})"}, {"category_id": 13, "poly": [1341, 730, 1565, 730, 1565, 765, 1341, 765], "score": 0.92, "latex": "C_{a}(p,\\bar{p})\\gets C(p,\\bar{p})"}, {"category_id": 13, "poly": [629, 1436, 691, 1436, 691, 1470, 629, 1470], "score": 0.92, "latex": "m(p)"}, {"category_id": 13, "poly": [277, 1469, 361, 1469, 361, 1504, 277, 1504], "score": 0.92, "latex": "\\bar{p}\\in S_{p}"}, {"category_id": 14, "poly": [1030, 541, 1398, 541, 1398, 582, 1030, 582], "score": 0.92, "latex": 
"C^{i}(p,\\bar{p})=C^{0}(p,\\bar{p})+{\\Lambda^{i}}(p,\\bar{p})\\,,"}, {"category_id": 13, "poly": [453, 356, 518, 356, 518, 391, 453, 391], "score": 0.91, "latex": "\\mathcal{O}(\\omega)"}, {"category_id": 14, "poly": [146, 714, 787, 714, 787, 791, 146, 791], "score": 0.91, "latex": "C(p,\\bar{p})\\gets\\frac{(1-\\lambda)\\cdot C(p,\\bar{p})+\\lambda\\cdot w_{t}(p,p_{t-1})\\cdot C_{a}(p,\\bar{p})}{(1-\\lambda)+\\lambda\\cdot w_{t}(p,p_{t-1})},"}, {"category_id": 13, "poly": [1095, 231, 1134, 231, 1134, 270, 1095, 270], "score": 0.9, "latex": "D_{p}^{i}"}, {"category_id": 13, "poly": [1313, 1752, 1447, 1752, 1447, 1783, 1313, 1783], "score": 0.89, "latex": "640~\\times~480"}, {"category_id": 13, "poly": [593, 1782, 627, 1782, 627, 1815, 593, 1815], "score": 0.89, "latex": "F_{p}"}, {"category_id": 13, "poly": [133, 326, 209, 326, 209, 355, 133, 355], "score": 0.88, "latex": "\\omega\\times\\omega"}, {"category_id": 13, "poly": [208, 1089, 236, 1089, 236, 1116, 208, 1116], "score": 0.85, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [1466, 769, 1484, 769, 1484, 797, 1466, 797], "score": 0.83, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [133, 935, 177, 935, 177, 963, 133, 963], "score": 0.83, "latex": "p_{t-1}"}, {"category_id": 13, "poly": [608, 1753, 627, 1753, 627, 1779, 608, 1779], "score": 0.81, "latex": "p"}, {"category_id": 13, "poly": [491, 799, 511, 799, 511, 825, 491, 825], "score": 0.81, "latex": "\\lambda"}, {"category_id": 13, "poly": [1018, 770, 1037, 770, 1037, 796, 1018, 796], "score": 0.81, "latex": "p"}, {"category_id": 13, "poly": [1086, 470, 1107, 470, 1107, 491, 1086, 491], "score": 0.8, "latex": "\\alpha"}, {"category_id": 13, "poly": [466, 901, 485, 901, 485, 929, 466, 929], "score": 0.8, "latex": "p"}, {"category_id": 13, "poly": [208, 484, 227, 484, 227, 511, 208, 511], "score": 0.79, "latex": "p"}, {"category_id": 13, "poly": [462, 1443, 480, 1443, 480, 1468, 462, 1468], "score": 0.77, "latex": "p"}, {"category_id": 13, "poly": 
[266, 514, 288, 514, 288, 544, 266, 544], "score": 0.77, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [816, 1716, 836, 1716, 836, 1746, 816, 1746], "score": 0.73, "latex": "\\bar{p}"}, {"category_id": 13, "poly": [132, 405, 154, 405, 154, 432, 132, 432], "score": 0.27, "latex": "B"}, {"category_id": 13, "poly": [862, 160, 887, 160, 887, 187, 862, 187], "score": 0.26, "latex": "D"}, {"category_id": 15, "poly": [887.0, 852.0, 1568.0, 855.0, 1568.0, 894.0, 887.0, 891.0], "score": 0.98, "text": " The speed and accuracy of real-time stereo matching al-"}, {"category_id": 15, "poly": [864.0, 891.0, 1566.0, 891.0, 1566.0, 924.0, 864.0, 924.0], "score": 0.99, "text": "gorithms are traditionally demonstrated using still-frame im-"}, {"category_id": 15, "poly": [859.0, 921.0, 1571.0, 919.0, 1571.0, 958.0, 859.0, 960.0], "score": 0.97, "text": " ages from the Middlebury stereo benchmark [1], [2]. Still"}, {"category_id": 15, "poly": [862.0, 956.0, 1568.0, 958.0, 1568.0, 990.0, 862.0, 988.0], "score": 0.99, "text": "frames, however, are insufficient for evaluating stereo match-"}, {"category_id": 15, "poly": [864.0, 992.0, 1571.0, 992.0, 1571.0, 1024.0, 864.0, 1024.0], "score": 1.0, "text": "ing algorithms that incorporate frame-to-frame prediction to"}, {"category_id": 15, "poly": [864.0, 1027.0, 1568.0, 1027.0, 1568.0, 1059.0, 864.0, 1059.0], "score": 0.97, "text": "enhance matching accuracy. An alternative approach is to"}, {"category_id": 15, "poly": [864.0, 1059.0, 1566.0, 1059.0, 1566.0, 1089.0, 864.0, 1089.0], "score": 0.99, "text": "use a stereo video sequence with a ground truth disparity"}, {"category_id": 15, "poly": [862.0, 1091.0, 1566.0, 1091.0, 1566.0, 1123.0, 862.0, 1123.0], "score": 1.0, "text": "for each frame. 
Obtaining the ground truth disparity of real"}, {"category_id": 15, "poly": [866.0, 1125.0, 1566.0, 1125.0, 1566.0, 1157.0, 866.0, 1157.0], "score": 0.98, "text": "world video sequences is a difficult undertaking due to the"}, {"category_id": 15, "poly": [859.0, 1153.0, 1568.0, 1155.0, 1568.0, 1194.0, 859.0, 1192.0], "score": 0.99, "text": "high frame rate of video and limitations in depth sensing-"}, {"category_id": 15, "poly": [864.0, 1192.0, 1568.0, 1192.0, 1568.0, 1224.0, 864.0, 1224.0], "score": 0.99, "text": "technology. To address the need for stereo video with ground"}, {"category_id": 15, "poly": [864.0, 1224.0, 1568.0, 1224.0, 1568.0, 1256.0, 864.0, 1256.0], "score": 0.99, "text": "truth disparities, five pairs of synthetic stereo video sequences"}, {"category_id": 15, "poly": [864.0, 1258.0, 1568.0, 1258.0, 1568.0, 1290.0, 864.0, 1290.0], "score": 0.99, "text": "of a computer-generated scene were given in [19]. While these"}, {"category_id": 15, "poly": [864.0, 1290.0, 1566.0, 1290.0, 1566.0, 1322.0, 864.0, 1322.0], "score": 1.0, "text": "videos incorporate a sufficient amount of movement variation,"}, {"category_id": 15, "poly": [862.0, 1325.0, 1568.0, 1325.0, 1568.0, 1357.0, 862.0, 1357.0], "score": 0.99, "text": "they were generated from relatively simple models using low-"}, {"category_id": 15, "poly": [862.0, 1359.0, 1571.0, 1359.0, 1571.0, 1389.0, 862.0, 1389.0], "score": 0.99, "text": "resolution rendering, and they do not provide occlusion or"}, {"category_id": 15, "poly": [862.0, 1386.0, 1088.0, 1394.0, 1087.0, 1426.0, 861.0, 1418.0], "score": 0.98, "text": "discontinuity maps."}, {"category_id": 15, "poly": [129.0, 156.0, 839.0, 158.0, 838.0, 197.0, 129.0, 195.0], "score": 0.99, "text": "the matching cost by performing two-pass aggregation using"}, {"category_id": 15, "poly": [130.0, 188.0, 841.0, 193.0, 841.0, 229.0, 129.0, 225.0], "score": 0.98, "text": "two orthogonal 1D windows [5], [6], [8]. 
The two-pass method "}, {"category_id": 15, "poly": [129.0, 225.0, 841.0, 222.0, 841.0, 261.0, 129.0, 264.0], "score": 0.99, "text": "first aggregates matching costs in the vertical direction, and"}, {"category_id": 15, "poly": [134.0, 261.0, 838.0, 261.0, 838.0, 293.0, 134.0, 293.0], "score": 0.99, "text": "then computes a weighted sum of the aggregated costs in the"}, {"category_id": 15, "poly": [132.0, 291.0, 838.0, 291.0, 838.0, 330.0, 132.0, 330.0], "score": 0.99, "text": "horizontal direction. Given that support regions are of size"}, {"category_id": 15, "poly": [136.0, 360.0, 336.0, 360.0, 336.0, 392.0, 136.0, 392.0], "score": 0.99, "text": "aggregation from"}, {"category_id": 15, "poly": [415.0, 360.0, 452.0, 360.0, 452.0, 392.0, 415.0, 392.0], "score": 0.98, "text": "to"}, {"category_id": 15, "poly": [210.0, 321.0, 836.0, 321.0, 836.0, 360.0, 210.0, 360.0], "score": 0.98, "text": ", the two-pass method reduces the complexity of cost"}, {"category_id": 15, "poly": [887.0, 1416.0, 1571.0, 1419.0, 1571.0, 1458.0, 887.0, 1455.0], "score": 0.98, "text": " To evaluate the performance of temporal aggregation, a"}, {"category_id": 15, "poly": [862.0, 1453.0, 1566.0, 1453.0, 1566.0, 1485.0, 862.0, 1485.0], "score": 0.98, "text": "new synthetic stereo video sequence is introduced along with"}, {"category_id": 15, "poly": [862.0, 1490.0, 1566.0, 1487.0, 1566.0, 1519.0, 862.0, 1522.0], "score": 0.99, "text": "corresponding disparity maps, occlusion maps, and disconti-"}, {"category_id": 15, "poly": [862.0, 1519.0, 1571.0, 1519.0, 1571.0, 1558.0, 862.0, 1558.0], "score": 0.99, "text": "nuity maps for evaluating the performance of temporal stereo"}, {"category_id": 15, "poly": [864.0, 1556.0, 1568.0, 1556.0, 1568.0, 1588.0, 864.0, 1588.0], "score": 1.0, "text": "matching algorithms. 
To create the video sequence, a complex"}, {"category_id": 15, "poly": [864.0, 1590.0, 1568.0, 1590.0, 1568.0, 1620.0, 864.0, 1620.0], "score": 0.99, "text": "scene was constructed using Google Sketchup and a pair"}, {"category_id": 15, "poly": [864.0, 1622.0, 1568.0, 1622.0, 1568.0, 1655.0, 864.0, 1655.0], "score": 0.99, "text": "of animated paths were rendered photorealistically using the"}, {"category_id": 15, "poly": [859.0, 1650.0, 1571.0, 1652.0, 1571.0, 1691.0, 859.0, 1689.0], "score": 0.99, "text": " Kerkythea rendering software. Realistic material properties"}, {"category_id": 15, "poly": [864.0, 1689.0, 1566.0, 1689.0, 1566.0, 1721.0, 864.0, 1721.0], "score": 1.0, "text": "were used to give surfaces a natural-looking appearance by"}, {"category_id": 15, "poly": [864.0, 1723.0, 1566.0, 1723.0, 1566.0, 1755.0, 864.0, 1755.0], "score": 0.98, "text": "adjusting their specularity, reflectance, and diffusion. The"}, {"category_id": 15, "poly": [864.0, 1788.0, 1568.0, 1788.0, 1568.0, 1820.0, 864.0, 1820.0], "score": 1.0, "text": "frame rate of 30 frames per second, and a duration of 4"}, {"category_id": 15, "poly": [862.0, 1817.0, 1568.0, 1820.0, 1568.0, 1859.0, 861.0, 1856.0], "score": 0.98, "text": "seconds. In addition to performing photorealistic rendering."}, {"category_id": 15, "poly": [864.0, 1856.0, 1568.0, 1856.0, 1568.0, 1888.0, 864.0, 1888.0], "score": 0.99, "text": "depth renders of both video sequences were also generated and"}, {"category_id": 15, "poly": [864.0, 1888.0, 1566.0, 1888.0, 1566.0, 1920.0, 864.0, 1920.0], "score": 0.98, "text": "converted to ground truth disparity for the stereo video. 
The"}, {"category_id": 15, "poly": [862.0, 1920.0, 1564.0, 1920.0, 1564.0, 1952.0, 862.0, 1952.0], "score": 0.99, "text": "video sequences and ground truth data have been made avail-"}, {"category_id": 15, "poly": [862.0, 1950.0, 1566.0, 1953.0, 1566.0, 1985.0, 862.0, 1982.0], "score": 0.99, "text": "able at http://mc2.unl.edu/current-research"}, {"category_id": 15, "poly": [866.0, 1989.0, 1566.0, 1989.0, 1566.0, 2019.0, 866.0, 2019.0], "score": 0.98, "text": "/ image-processing/. Figure 2 shows two sample frames"}, {"category_id": 15, "poly": [862.0, 1755.0, 1312.0, 1755.0, 1312.0, 1788.0, 862.0, 1788.0], "score": 0.97, "text": "video sequence has a resolution of "}, {"category_id": 15, "poly": [1448.0, 1755.0, 1566.0, 1755.0, 1566.0, 1788.0, 1448.0, 1788.0], "score": 0.99, "text": "pixels,a"}, {"category_id": 15, "poly": [889.0, 197.0, 1566.0, 199.0, 1566.0, 238.0, 889.0, 236.0], "score": 1.0, "text": "Once the first iteration of stereo matching is complete,"}, {"category_id": 15, "poly": [864.0, 268.0, 1566.0, 268.0, 1566.0, 300.0, 864.0, 300.0], "score": 0.99, "text": "subsequent iterations. This is done by penalizing disparities"}, {"category_id": 15, "poly": [864.0, 302.0, 1568.0, 302.0, 1568.0, 335.0, 864.0, 335.0], "score": 1.0, "text": "that deviate from their expected values. 
The penalty function"}, {"category_id": 15, "poly": [862.0, 337.0, 996.0, 337.0, 996.0, 369.0, 862.0, 369.0], "score": 0.97, "text": "is given by"}, {"category_id": 15, "poly": [864.0, 236.0, 1094.0, 236.0, 1094.0, 268.0, 864.0, 268.0], "score": 0.96, "text": "disparityestimates"}, {"category_id": 15, "poly": [1135.0, 236.0, 1568.0, 236.0, 1568.0, 268.0, 1135.0, 268.0], "score": 0.97, "text": " can be used to guide matching in"}, {"category_id": 15, "poly": [157.0, 1366.0, 839.0, 1368.0, 838.0, 1407.0, 157.0, 1405.0], "score": 1.0, "text": "Having performed temporal cost aggregation, matches are"}, {"category_id": 15, "poly": [134.0, 1405.0, 834.0, 1405.0, 834.0, 1437.0, 134.0, 1437.0], "score": 0.99, "text": "determined using the Winner-Takes-All (WTA) match selec-"}, {"category_id": 15, "poly": [132.0, 1506.0, 374.0, 1506.0, 374.0, 1538.0, 132.0, 1538.0], "score": 1.0, "text": "cost, and is given by"}, {"category_id": 15, "poly": [692.0, 1439.0, 834.0, 1439.0, 834.0, 1471.0, 692.0, 1471.0], "score": 0.99, "text": ", is the can-"}, {"category_id": 15, "poly": [134.0, 1474.0, 276.0, 1474.0, 276.0, 1506.0, 134.0, 1506.0], "score": 0.98, "text": "didate pixel"}, {"category_id": 15, "poly": [362.0, 1474.0, 836.0, 1474.0, 836.0, 1506.0, 362.0, 1506.0], "score": 0.99, "text": " characterized by the minimum matching"}, {"category_id": 15, "poly": [134.0, 1439.0, 461.0, 1439.0, 461.0, 1471.0, 134.0, 1471.0], "score": 1.0, "text": "tion criteria. The match for"}, {"category_id": 15, "poly": [481.0, 1439.0, 628.0, 1439.0, 628.0, 1471.0, 481.0, 1471.0], "score": 0.96, "text": ", denoted as"}, {"category_id": 15, "poly": [134.0, 548.0, 838.0, 545.0, 838.0, 577.0, 134.0, 580.0], "score": 0.99, "text": "aggregation routine is exectuted. 
At each time instance, the"}, {"category_id": 15, "poly": [134.0, 614.0, 834.0, 614.0, 834.0, 646.0, 134.0, 646.0], "score": 1.0, "text": "weighted summation of costs obtained in the previous frames."}, {"category_id": 15, "poly": [132.0, 646.0, 838.0, 644.0, 838.0, 676.0, 132.0, 678.0], "score": 1.0, "text": "During temporal aggregation, the auxiliary cost is merged with"}, {"category_id": 15, "poly": [132.0, 678.0, 675.0, 681.0, 674.0, 713.0, 132.0, 710.0], "score": 0.99, "text": "the cost obtained from the current frame using"}, {"category_id": 15, "poly": [134.0, 580.0, 549.0, 580.0, 549.0, 612.0, 134.0, 612.0], "score": 1.0, "text": "algorithm stores an auxiliary cost"}, {"category_id": 15, "poly": [649.0, 580.0, 841.0, 580.0, 841.0, 612.0, 649.0, 612.0], "score": 0.96, "text": "which holds a"}, {"category_id": 15, "poly": [157.0, 445.0, 425.0, 442.0, 425.0, 481.0, 157.0, 484.0], "score": 0.98, "text": " Once aggregated costs"}, {"category_id": 15, "poly": [513.0, 445.0, 838.0, 442.0, 838.0, 481.0, 513.0, 484.0], "score": 0.96, "text": " have been computed for all"}, {"category_id": 15, "poly": [132.0, 481.0, 207.0, 481.0, 207.0, 513.0, 132.0, 513.0], "score": 1.0, "text": "pixels"}, {"category_id": 15, "poly": [228.0, 481.0, 838.0, 481.0, 838.0, 513.0, 228.0, 513.0], "score": 0.97, "text": " in the reference image and their respective matching"}, {"category_id": 15, "poly": [134.0, 516.0, 265.0, 516.0, 265.0, 548.0, 134.0, 548.0], "score": 1.0, "text": "candidates"}, {"category_id": 15, "poly": [289.0, 516.0, 838.0, 516.0, 838.0, 548.0, 289.0, 548.0], "score": 0.98, "text": " in the target image, a single-pass temporal"}, {"category_id": 15, "poly": [132.0, 1116.0, 841.0, 1116.0, 841.0, 1155.0, 132.0, 1155.0], "score": 0.99, "text": "in the temporal dimension. 
The temporal adaptive weight has "}, {"category_id": 15, "poly": [134.0, 1153.0, 838.0, 1153.0, 838.0, 1185.0, 134.0, 1185.0], "score": 0.99, "text": "the effect of preserving edges in the temporal domain, such"}, {"category_id": 15, "poly": [132.0, 1182.0, 836.0, 1182.0, 836.0, 1215.0, 132.0, 1215.0], "score": 0.98, "text": "that when a pixel coordinate transitions from one side of an"}, {"category_id": 15, "poly": [134.0, 1219.0, 838.0, 1219.0, 838.0, 1251.0, 134.0, 1251.0], "score": 0.98, "text": "edge to another in subsequent frames, the auxiliary cost is"}, {"category_id": 15, "poly": [134.0, 1254.0, 838.0, 1254.0, 838.0, 1283.0, 134.0, 1283.0], "score": 0.99, "text": "assigned a small weight and the majority of the cost is derived"}, {"category_id": 15, "poly": [130.0, 1283.0, 404.0, 1286.0, 404.0, 1318.0, 129.0, 1315.0], "score": 1.0, "text": "from the current frame."}, {"category_id": 15, "poly": [134.0, 1086.0, 207.0, 1086.0, 207.0, 1118.0, 134.0, 1118.0], "score": 0.99, "text": "where"}, {"category_id": 15, "poly": [237.0, 1086.0, 836.0, 1086.0, 836.0, 1118.0, 237.0, 1118.0], "score": 0.99, "text": "regulates the strength of grouping by color similarity"}, {"category_id": 15, "poly": [864.0, 600.0, 1568.0, 600.0, 1568.0, 632.0, 864.0, 632.0], "score": 1.0, "text": "and the matches are reselected using the WTA match selection"}, {"category_id": 15, "poly": [864.0, 635.0, 1568.0, 635.0, 1568.0, 667.0, 864.0, 667.0], "score": 0.99, "text": "criteria. 
The resulting disparity maps are then post-processed"}, {"category_id": 15, "poly": [864.0, 669.0, 1564.0, 669.0, 1564.0, 699.0, 864.0, 699.0], "score": 0.98, "text": "using a combination of median filtering and occlusion filling."}, {"category_id": 15, "poly": [864.0, 701.0, 1566.0, 701.0, 1566.0, 731.0, 864.0, 731.0], "score": 0.98, "text": "Finally, the current cost becomes the auxiliary cost for the next"}, {"category_id": 15, "poly": [862.0, 731.0, 1340.0, 731.0, 1340.0, 770.0, 862.0, 770.0], "score": 0.99, "text": "pair of frames in the video sequence, i.e.,"}, {"category_id": 15, "poly": [864.0, 768.0, 1017.0, 768.0, 1017.0, 800.0, 864.0, 800.0], "score": 1.0, "text": "for all pixels"}, {"category_id": 15, "poly": [1038.0, 768.0, 1465.0, 768.0, 1465.0, 800.0, 1038.0, 800.0], "score": 0.98, "text": " in the and their matching candidates"}, {"category_id": 15, "poly": [864.0, 502.0, 1427.0, 502.0, 1427.0, 532.0, 864.0, 532.0], "score": 1.0, "text": "values are incorporated into the matching cost as"}, {"category_id": 15, "poly": [864.0, 468.0, 1085.0, 468.0, 1085.0, 500.0, 864.0, 500.0], "score": 0.96, "text": "where the value of"}, {"category_id": 15, "poly": [1108.0, 468.0, 1564.0, 468.0, 1564.0, 500.0, 1108.0, 500.0], "score": 0.99, "text": "is chosen empirically. Next, the penalty"}, {"category_id": 15, "poly": [134.0, 866.0, 838.0, 866.0, 838.0, 898.0, 134.0, 898.0], "score": 0.99, "text": "temporal domain. 
The temporal adaptive weight computed"}, {"category_id": 15, "poly": [132.0, 967.0, 263.0, 967.0, 263.0, 999.0, 132.0, 999.0], "score": 0.93, "text": "is given by"}, {"category_id": 15, "poly": [134.0, 834.0, 320.0, 834.0, 320.0, 866.0, 134.0, 866.0], "score": 0.97, "text": "smoothing and"}, {"category_id": 15, "poly": [444.0, 834.0, 836.0, 834.0, 836.0, 866.0, 444.0, 866.0], "score": 0.92, "text": " enforces color similarity in the"}, {"category_id": 15, "poly": [178.0, 930.0, 838.0, 928.0, 839.0, 967.0, 178.0, 969.0], "score": 0.99, "text": ", located at the same spatial coordinate in the prior frame,"}, {"category_id": 15, "poly": [132.0, 795.0, 490.0, 800.0, 490.0, 832.0, 132.0, 827.0], "score": 0.99, "text": "where the feedback coefficient"}, {"category_id": 15, "poly": [512.0, 795.0, 836.0, 800.0, 836.0, 832.0, 512.0, 827.0], "score": 0.97, "text": " controls the amount of cost"}, {"category_id": 15, "poly": [136.0, 898.0, 465.0, 898.0, 465.0, 930.0, 136.0, 930.0], "score": 0.99, "text": "between the pixel of interest"}, {"category_id": 15, "poly": [486.0, 898.0, 838.0, 898.0, 838.0, 930.0, 486.0, 930.0], "score": 1.0, "text": "in the current frame and pixel"}, {"category_id": 15, "poly": [159.0, 1616.0, 836.0, 1616.0, 836.0, 1648.0, 159.0, 1648.0], "score": 0.99, "text": "To asses the level of confidence associated with selecting"}, {"category_id": 15, "poly": [132.0, 1648.0, 836.0, 1650.0, 836.0, 1682.0, 132.0, 1680.0], "score": 1.0, "text": "minimum cost matches, the algorithm determines another set"}, {"category_id": 15, "poly": [134.0, 1684.0, 838.0, 1684.0, 838.0, 1716.0, 134.0, 1716.0], "score": 1.0, "text": "of matches, this time from the target to reference image, and"}, {"category_id": 15, "poly": [134.0, 1783.0, 182.0, 1783.0, 182.0, 1815.0, 134.0, 1815.0], "score": 1.0, "text": "and"}, {"category_id": 15, "poly": [136.0, 1714.0, 580.0, 1714.0, 580.0, 1746.0, 136.0, 1746.0], "score": 0.98, "text": "verifies if the results agree. 
Given that"}, {"category_id": 15, "poly": [305.0, 1783.0, 592.0, 1783.0, 592.0, 1815.0, 305.0, 1815.0], "score": 0.99, "text": ", the confidence measure"}, {"category_id": 15, "poly": [628.0, 1783.0, 811.0, 1783.0, 811.0, 1815.0, 628.0, 1815.0], "score": 0.97, "text": "is computed as"}, {"category_id": 15, "poly": [132.0, 1746.0, 607.0, 1751.0, 607.0, 1783.0, 132.0, 1778.0], "score": 1.0, "text": "in the right image is the match for pixel"}, {"category_id": 15, "poly": [628.0, 1746.0, 836.0, 1751.0, 836.0, 1783.0, 628.0, 1778.0], "score": 0.98, "text": "in the left image,"}, {"category_id": 15, "poly": [695.0, 1714.0, 815.0, 1714.0, 815.0, 1746.0, 695.0, 1746.0], "score": 0.99, "text": ", i.e. pixel"}, {"category_id": 15, "poly": [1132.0, 814.0, 1298.0, 814.0, 1298.0, 852.0, 1132.0, 852.0], "score": 1.0, "text": "IV. RESULTS"}, {"category_id": 15, "poly": [155.0, 401.0, 481.0, 406.0, 480.0, 445.0, 155.0, 440.0], "score": 0.99, "text": "Temporal cost aggregation"}, {"category_id": 15, "poly": [129.0, 1325.0, 718.0, 1327.0, 718.0, 1366.0, 129.0, 1363.0], "score": 0.99, "text": "C. 
Disparity Selection and Confidence Assessment"}, {"category_id": 15, "poly": [888.0, 158.0, 1252.0, 158.0, 1252.0, 197.0, 888.0, 197.0], "score": 0.97, "text": "Iterative Disparity Refinement"}], "page_info": {"page_no": 2, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [133.2669677734375, 156.7020721435547, 840.6729125976562, 156.7020721435547, 840.6729125976562, 257.75836181640625, 133.2669677734375, 257.75836181640625], "score": 0.9999951124191284}, {"category_id": 3, "poly": [866.177734375, 171.2958526611328, 1510.944580078125, 171.2958526611328, 1510.944580078125, 848.8190307617188, 866.177734375, 848.8190307617188], "score": 0.9999942779541016}, {"category_id": 1, "poly": [131.3756561279297, 1520.5887451171875, 838.545166015625, 1520.5887451171875, 838.545166015625, 1885.353515625, 131.3756561279297, 1885.353515625], "score": 0.9999925494194031}, {"category_id": 4, "poly": [131.56919860839844, 1352.6187744140625, 840.1758422851562, 1352.6187744140625, 840.1758422851562, 1490.513671875, 131.56919860839844, 1490.513671875], "score": 0.9999915361404419}, {"category_id": 1, "poly": [132.41786193847656, 1886.0615234375, 838.675537109375, 1886.0615234375, 838.675537109375, 2019.347412109375, 132.41786193847656, 2019.347412109375], "score": 0.9999526739120483}, {"category_id": 3, "poly": [136.71240234375, 278.259765625, 816.1984252929688, 278.259765625, 816.1984252929688, 1348.5758056640625, 136.71240234375, 1348.5758056640625], "score": 0.9999439120292664}, {"category_id": 1, "poly": [863.4852905273438, 1917.056884765625, 1569.6337890625, 1917.056884765625, 1569.6337890625, 2020.57421875, 863.4852905273438, 2020.57421875], "score": 0.9999344348907471}, {"category_id": 4, "poly": [861.7813720703125, 1749.4459228515625, 1567.659912109375, 1749.4459228515625, 1567.659912109375, 1852.389892578125, 861.7813720703125, 1852.389892578125], "score": 0.9986151456832886}, {"category_id": 3, "poly": [874.6467895507812, 1536.7642822265625, 
1506.6514892578125, 1536.7642822265625, 1506.6514892578125, 1734.9659423828125, 874.6467895507812, 1734.9659423828125], "score": 0.9940656423568726}, {"category_id": 4, "poly": [859.3250122070312, 861.2320556640625, 1569.650634765625, 861.2320556640625, 1569.650634765625, 1033.0804443359375, 859.3250122070312, 1033.0804443359375], "score": 0.985899806022644}, {"category_id": 1, "poly": [861.6172485351562, 1064.186279296875, 1564.036865234375, 1064.186279296875, 1564.036865234375, 1135.5125732421875, 861.6172485351562, 1135.5125732421875], "score": 0.9128350019454956}, {"category_id": 3, "poly": [888.8074340820312, 1163.7965087890625, 1529.8028564453125, 1163.7965087890625, 1529.8028564453125, 1510.91162109375, 888.8074340820312, 1510.91162109375], "score": 0.7896175384521484}, {"category_id": 5, "poly": [900.75146484375, 1161.0631103515625, 1527.15673828125, 1161.0631103515625, 1527.15673828125, 1490.2149658203125, 900.75146484375, 1490.2149658203125], "score": 0.7772396802902222}, {"category_id": 0, "poly": [1178.85791015625, 152.25347900390625, 1284.6339111328125, 152.25347900390625, 1284.6339111328125, 179.1011962890625, 1178.85791015625, 179.1011962890625], "score": 0.5732811689376831}, {"category_id": 4, "poly": [1178.981689453125, 152.21678161621094, 1284.4158935546875, 152.21678161621094, 1284.4158935546875, 179.05447387695312, 1178.981689453125, 179.05447387695312], "score": 0.4503781795501709}, {"category_id": 13, "poly": [1295, 896, 1483, 896, 1483, 931, 1295, 931], "score": 0.93, "latex": "\\{\\pm0,\\pm20,\\pm40\\}"}, {"category_id": 13, "poly": [481, 1919, 534, 1919, 534, 1949, 481, 1949], "score": 0.87, "latex": "\\pm20"}, {"category_id": 13, "poly": [591, 1919, 644, 1919, 644, 1949, 591, 1949], "score": 0.87, "latex": "\\pm40"}, {"category_id": 13, "poly": [1227, 1436, 1253, 1436, 1253, 1459, 1227, 1459], "score": 0.86, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [1295, 1436, 1323, 1436, 1323, 1461, 1295, 1461], "score": 0.85, "latex": 
"\\gamma_{g}"}, {"category_id": 13, "poly": [133, 1588, 186, 1588, 186, 1618, 133, 1618], "score": 0.85, "latex": "\\pm20"}, {"category_id": 13, "poly": [249, 1587, 302, 1587, 302, 1618, 249, 1618], "score": 0.84, "latex": "\\pm40"}, {"category_id": 13, "poly": [787, 1555, 828, 1555, 828, 1585, 787, 1585], "score": 0.82, "latex": "\\pm0"}, {"category_id": 13, "poly": [532, 1421, 572, 1421, 572, 1452, 532, 1452], "score": 0.81, "latex": "3^{\\mathrm{rd}}"}, {"category_id": 13, "poly": [230, 1389, 266, 1389, 266, 1419, 230, 1419], "score": 0.8, "latex": "1^{\\mathrm{st}}"}, {"category_id": 13, "poly": [655, 1986, 675, 1986, 675, 2013, 655, 2013], "score": 0.78, "latex": "\\lambda"}, {"category_id": 13, "poly": [200, 1455, 240, 1455, 240, 1486, 200, 1486], "score": 0.75, "latex": "4^{\\mathrm{th}}"}, {"category_id": 13, "poly": [954, 1255, 980, 1255, 980, 1275, 954, 1275], "score": 0.75, "latex": "\\gamma_{c}"}, {"category_id": 13, "poly": [954, 1281, 980, 1281, 980, 1302, 954, 1302], "score": 0.74, "latex": "\\gamma_{g}"}, {"category_id": 13, "poly": [959, 1227, 976, 1227, 976, 1245, 959, 1245], "score": 0.74, "latex": "\\tau"}, {"category_id": 13, "poly": [960, 1352, 976, 1352, 976, 1372, 960, 1372], "score": 0.72, "latex": "k"}, {"category_id": 13, "poly": [410, 1986, 430, 1986, 430, 2013, 410, 2013], "score": 0.7, "latex": "\\lambda"}, {"category_id": 13, "poly": [955, 1331, 979, 1331, 979, 1351, 955, 1351], "score": 0.7, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [1489, 1752, 1510, 1752, 1510, 1778, 1489, 1778], "score": 0.69, "latex": "\\lambda"}, {"category_id": 13, "poly": [1176, 965, 1195, 965, 1195, 992, 1176, 992], "score": 0.69, "latex": "\\lambda"}, {"category_id": 13, "poly": [246, 1421, 289, 1421, 289, 1452, 246, 1452], "score": 0.69, "latex": "2^{\\mathrm{nd}}"}, {"category_id": 13, "poly": [958, 1302, 977, 1302, 977, 1323, 958, 1323], "score": 0.63, "latex": "\\lambda"}, {"category_id": 13, "poly": [959, 1380, 977, 1380, 977, 1397, 959, 
1397], "score": 0.58, "latex": "\\alpha"}, {"category_id": 13, "poly": [436, 1621, 455, 1621, 455, 1648, 436, 1648], "score": 0.58, "latex": "\\lambda"}, {"category_id": 13, "poly": [959, 1204, 977, 1204, 977, 1219, 959, 1219], "score": 0.42, "latex": "\\omega"}, {"category_id": 13, "poly": [870, 1592, 890, 1592, 890, 1617, 870, 1617], "score": 0.31, "latex": "\\lambda"}, {"category_id": 15, "poly": [134.0, 160.0, 836.0, 160.0, 836.0, 192.0, 134.0, 192.0], "score": 0.99, "text": "of the synthetic stereo scene from a single camera perspective,"}, {"category_id": 15, "poly": [134.0, 195.0, 838.0, 195.0, 838.0, 227.0, 134.0, 227.0], "score": 0.99, "text": "along with the ground truth disparity, occlusion map, and"}, {"category_id": 15, "poly": [130.0, 222.0, 347.0, 230.0, 346.0, 264.0, 129.0, 256.0], "score": 0.99, "text": "discontinuity map."}, {"category_id": 15, "poly": [155.0, 1517.0, 841.0, 1519.0, 841.0, 1558.0, 155.0, 1556.0], "score": 0.99, "text": " The results of temporal stereo matching are given in Figure"}, {"category_id": 15, "poly": [132.0, 1657.0, 838.0, 1657.0, 838.0, 1689.0, 132.0, 1689.0], "score": 0.99, "text": "stereo matching methods, improvements are negligible when"}, {"category_id": 15, "poly": [132.0, 1691.0, 838.0, 1691.0, 838.0, 1723.0, 132.0, 1723.0], "score": 0.99, "text": "no noise is added to the images [10], [19]. 
This is largely due"}, {"category_id": 15, "poly": [132.0, 1723.0, 836.0, 1723.0, 836.0, 1753.0, 132.0, 1753.0], "score": 0.98, "text": "to the fact that the video used to evaluate these methods is"}, {"category_id": 15, "poly": [129.0, 1753.0, 838.0, 1751.0, 839.0, 1790.0, 129.0, 1792.0], "score": 0.99, "text": " computer generated with very little noise to start with, thus"}, {"category_id": 15, "poly": [134.0, 1790.0, 836.0, 1790.0, 836.0, 1822.0, 134.0, 1822.0], "score": 0.99, "text": "the noise suppression achieved with temporal stereo matching"}, {"category_id": 15, "poly": [132.0, 1817.0, 839.0, 1822.0, 838.0, 1859.0, 132.0, 1854.0], "score": 0.99, "text": "shows little to no improvement over methods that operate on"}, {"category_id": 15, "poly": [130.0, 1856.0, 319.0, 1859.0, 318.0, 1891.0, 129.0, 1888.0], "score": 0.99, "text": "pairs of images."}, {"category_id": 15, "poly": [187.0, 1590.0, 248.0, 1590.0, 248.0, 1622.0, 187.0, 1622.0], "score": 0.87, "text": ",and"}, {"category_id": 15, "poly": [303.0, 1590.0, 838.0, 1590.0, 838.0, 1622.0, 303.0, 1622.0], "score": 0.98, "text": ". Each performance plot is given as a function"}, {"category_id": 15, "poly": [127.0, 1551.0, 786.0, 1554.0, 786.0, 1593.0, 127.0, 1590.0], "score": 0.98, "text": " 3 for uniform additive noise confined to the ranges of"}, {"category_id": 15, "poly": [134.0, 1622.0, 435.0, 1622.0, 435.0, 1655.0, 134.0, 1655.0], "score": 0.99, "text": "of the feedback coefficient"}, {"category_id": 15, "poly": [456.0, 1622.0, 836.0, 1622.0, 836.0, 1655.0, 456.0, 1655.0], "score": 0.97, "text": ". 
As with the majority of temporal"}, {"category_id": 15, "poly": [134.0, 1359.0, 834.0, 1359.0, 834.0, 1391.0, 134.0, 1391.0], "score": 0.99, "text": "Figure 2: Two sample frames from the synthetic video se-"}, {"category_id": 15, "poly": [573.0, 1418.0, 836.0, 1421.0, 836.0, 1460.0, 573.0, 1457.0], "score": 1.0, "text": "row), and discontinuity"}, {"category_id": 15, "poly": [134.0, 1393.0, 229.0, 1393.0, 229.0, 1425.0, 134.0, 1425.0], "score": 0.96, "text": "quence ("}, {"category_id": 15, "poly": [267.0, 1393.0, 836.0, 1393.0, 836.0, 1425.0, 267.0, 1425.0], "score": 0.98, "text": "row), along with their corresponding ground truth"}, {"category_id": 15, "poly": [127.0, 1456.0, 199.0, 1450.0, 199.0, 1489.0, 128.0, 1495.0], "score": 0.91, "text": "map ("}, {"category_id": 15, "poly": [241.0, 1456.0, 309.0, 1450.0, 310.0, 1489.0, 241.0, 1495.0], "score": 1.0, "text": "row)."}, {"category_id": 15, "poly": [129.0, 1418.0, 245.0, 1421.0, 245.0, 1460.0, 129.0, 1457.0], "score": 0.93, "text": " disparity "}, {"category_id": 15, "poly": [290.0, 1418.0, 531.0, 1421.0, 531.0, 1460.0, 290.0, 1457.0], "score": 1.0, "text": "row), occlusion map ("}, {"category_id": 15, "poly": [159.0, 1888.0, 836.0, 1888.0, 836.0, 1920.0, 159.0, 1920.0], "score": 0.99, "text": " Significant improvements in accuracy can be seen in Figure"}, {"category_id": 15, "poly": [132.0, 1950.0, 839.0, 1955.0, 838.0, 1987.0, 132.0, 1982.0], "score": 1.0, "text": "the effect of noise in the current frame is reduced by increasing"}, {"category_id": 15, "poly": [134.0, 1920.0, 480.0, 1920.0, 480.0, 1952.0, 134.0, 1952.0], "score": 0.99, "text": "3 when the noise has ranges of"}, {"category_id": 15, "poly": [535.0, 1920.0, 590.0, 1920.0, 590.0, 1952.0, 535.0, 1952.0], "score": 0.92, "text": " and"}, {"category_id": 15, "poly": [645.0, 1920.0, 836.0, 1920.0, 836.0, 1952.0, 645.0, 1952.0], "score": 0.96, "text": ". 
In this scenario,"}, {"category_id": 15, "poly": [676.0, 1989.0, 838.0, 1989.0, 838.0, 2019.0, 676.0, 2019.0], "score": 0.98, "text": "has the effect"}, {"category_id": 15, "poly": [134.0, 1989.0, 409.0, 1989.0, 409.0, 2019.0, 134.0, 2019.0], "score": 1.0, "text": "the feedback coefficient"}, {"category_id": 15, "poly": [431.0, 1989.0, 654.0, 1989.0, 654.0, 2019.0, 431.0, 2019.0], "score": 0.97, "text": ". This increasing of"}, {"category_id": 15, "poly": [864.0, 1920.0, 1566.0, 1920.0, 1566.0, 1952.0, 864.0, 1952.0], "score": 0.98, "text": "of averaging out noise in the per-pixel costs by selecting"}, {"category_id": 15, "poly": [861.0, 1950.0, 1566.0, 1948.0, 1566.0, 1987.0, 862.0, 1989.0], "score": 0.98, "text": "matches based more heavily upon the auxiliary cost, which"}, {"category_id": 15, "poly": [862.0, 1989.0, 1568.0, 1989.0, 1568.0, 2021.0, 862.0, 2021.0], "score": 0.99, "text": "is essentially a much more stable running average of the cost"}, {"category_id": 15, "poly": [864.0, 1788.0, 1564.0, 1785.0, 1564.0, 1817.0, 864.0, 1820.0], "score": 0.99, "text": "responding to the smallest mean squared error (MSE) of the"}, {"category_id": 15, "poly": [864.0, 1822.0, 1427.0, 1822.0, 1427.0, 1854.0, 864.0, 1854.0], "score": 0.99, "text": "disparity estimates for a range of noise strengths."}, {"category_id": 15, "poly": [862.0, 1748.0, 1488.0, 1753.0, 1488.0, 1785.0, 861.0, 1781.0], "score": 0.99, "text": "Figure 4: Optimal values of the feedback coefficient "}, {"category_id": 15, "poly": [1511.0, 1748.0, 1561.0, 1753.0, 1561.0, 1785.0, 1511.0, 1781.0], "score": 0.96, "text": "cor-"}, {"category_id": 15, "poly": [864.0, 866.0, 1566.0, 866.0, 1566.0, 898.0, 864.0, 898.0], "score": 0.99, "text": "Figure 3: Performance of temporal matching at different levels"}, {"category_id": 15, "poly": [864.0, 935.0, 1566.0, 933.0, 1566.0, 965.0, 864.0, 967.0], "score": 0.98, "text": "squared error (MSE) of disparities is plotted versus the values"}, {"category_id": 15, 
"poly": [864.0, 1001.0, 1492.0, 1001.0, 1492.0, 1031.0, 864.0, 1031.0], "score": 0.99, "text": "values of MSE obtained without temporal aggregation."}, {"category_id": 15, "poly": [864.0, 901.0, 1294.0, 901.0, 1294.0, 933.0, 864.0, 933.0], "score": 0.99, "text": "of uniformly distributed image noise"}, {"category_id": 15, "poly": [1484.0, 901.0, 1568.0, 901.0, 1568.0, 933.0, 1484.0, 933.0], "score": 0.99, "text": ".Mean"}, {"category_id": 15, "poly": [864.0, 967.0, 1175.0, 967.0, 1175.0, 999.0, 864.0, 999.0], "score": 0.99, "text": "of the feedback coefficient"}, {"category_id": 15, "poly": [1196.0, 967.0, 1568.0, 967.0, 1568.0, 999.0, 1196.0, 999.0], "score": 0.99, "text": ". Dashed lines correspond to the"}, {"category_id": 15, "poly": [857.0, 1061.0, 1566.0, 1068.0, 1566.0, 1107.0, 857.0, 1100.0], "score": 0.99, "text": " Table I: Parameters used in the evaluation of real-time tempo-"}, {"category_id": 15, "poly": [859.0, 1102.0, 1093.0, 1105.0, 1092.0, 1137.0, 859.0, 1134.0], "score": 1.0, "text": "ral stereo matching."}, {"category_id": 15, "poly": [1178.0, 151.0, 1282.0, 151.0, 1282.0, 186.0, 1178.0, 186.0], "score": 1.0, "text": "Noise: \u00b10"}, {"category_id": 15, "poly": [1178.0, 151.0, 1282.0, 151.0, 1282.0, 186.0, 1178.0, 186.0], "score": 1.0, "text": "Noise: \u00b10"}], "page_info": {"page_no": 3, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 5, "poly": [880.81298828125, 613.750244140625, 1552.5638427734375, 613.750244140625, 1552.5638427734375, 855.9174194335938, 880.81298828125, 855.9174194335938], "score": 0.9999957084655762}, {"category_id": 1, "poly": [862.7925415039062, 158.05548095703125, 1569.6671142578125, 158.05548095703125, 1569.6671142578125, 456.6153869628906, 862.7925415039062, 456.6153869628906], "score": 0.9999922513961792}, {"category_id": 1, "poly": [864.6585083007812, 1061.7374267578125, 1570.4825439453125, 1061.7374267578125, 1570.4825439453125, 1459.7132568359375, 864.6585083007812, 1459.7132568359375], 
"score": 0.9999921321868896}, {"category_id": 1, "poly": [130.64285278320312, 1519.7022705078125, 836.2221069335938, 1519.7022705078125, 836.2221069335938, 1882.68359375, 130.64285278320312, 1882.68359375], "score": 0.9999898672103882}, {"category_id": 1, "poly": [133.1135711669922, 158.4307861328125, 837.9683837890625, 158.4307861328125, 837.9683837890625, 323.343017578125, 133.1135711669922, 323.343017578125], "score": 0.9999892115592957}, {"category_id": 4, "poly": [132.3511199951172, 1347.8763427734375, 839.7514038085938, 1347.8763427734375, 839.7514038085938, 1476.9757080078125, 132.3511199951172, 1476.9757080078125], "score": 0.9999880790710449}, {"category_id": 7, "poly": [887.6280517578125, 860.9362182617188, 1551.5972900390625, 860.9362182617188, 1551.5972900390625, 964.0142211914062, 887.6280517578125, 964.0142211914062], "score": 0.9999836683273315}, {"category_id": 1, "poly": [869.9986572265625, 1514.7762451171875, 1571.624755859375, 1514.7762451171875, 1571.624755859375, 2022.618896484375, 869.9986572265625, 2022.618896484375], "score": 0.9999811053276062}, {"category_id": 3, "poly": [164.82151794433594, 352.74810791015625, 805.8219604492188, 352.74810791015625, 805.8219604492188, 1320.43310546875, 164.82151794433594, 1320.43310546875], "score": 0.9999799728393555}, {"category_id": 0, "poly": [1137.668701171875, 1477.0120849609375, 1293.498046875, 1477.0120849609375, 1293.498046875, 1502.5439453125, 1137.668701171875, 1502.5439453125], "score": 0.9999679327011108}, {"category_id": 1, "poly": [133.0285186767578, 1886.7501220703125, 837.0147705078125, 1886.7501220703125, 837.0147705078125, 2018.0294189453125, 133.0285186767578, 2018.0294189453125], "score": 0.9999630451202393}, {"category_id": 0, "poly": [1114.8399658203125, 1022.4933471679688, 1317.0313720703125, 1022.4933471679688, 1317.0313720703125, 1052.679931640625, 1114.8399658203125, 1052.679931640625], "score": 0.9999338984489441}, {"category_id": 1, "poly": [862.0576171875, 480.8196105957031, 
1565.8367919921875, 480.8196105957031, 1565.8367919921875, 577.5508422851562, 862.0576171875, 577.5508422851562], "score": 0.8958550691604614}, {"category_id": 6, "poly": [862.0606079101562, 480.7809753417969, 1565.667724609375, 480.7809753417969, 1565.667724609375, 577.4689331054688, 862.0606079101562, 577.4689331054688], "score": 0.4145430028438568}, {"category_id": 13, "poly": [736, 1445, 827, 1445, 827, 1475, 736, 1475], "score": 0.9, "latex": "\\lambda=0.8"}, {"category_id": 13, "poly": [1003, 887, 1105, 887, 1105, 911, 1003, 911], "score": 0.89, "latex": "320\\times240"}, {"category_id": 13, "poly": [338, 1446, 391, 1446, 391, 1475, 338, 1475], "score": 0.87, "latex": "\\pm30"}, {"category_id": 13, "poly": [166, 1619, 219, 1619, 219, 1649, 166, 1649], "score": 0.85, "latex": "\\pm40"}, {"category_id": 13, "poly": [301, 196, 329, 196, 329, 224, 301, 224], "score": 0.84, "latex": "\\gamma_{t}"}, {"category_id": 13, "poly": [795, 1586, 836, 1586, 836, 1616, 795, 1616], "score": 0.84, "latex": "\\pm0"}, {"category_id": 13, "poly": [1037, 939, 1059, 939, 1059, 960, 1037, 960], "score": 0.83, "latex": "\\%"}, {"category_id": 13, "poly": [462, 1586, 482, 1586, 482, 1613, 462, 1613], "score": 0.78, "latex": "\\lambda"}, {"category_id": 15, "poly": [862.0, 160.0, 1571.0, 160.0, 1571.0, 192.0, 862.0, 192.0], "score": 0.98, "text": "the proposed implementation achieves the highest speed of"}, {"category_id": 15, "poly": [864.0, 195.0, 1566.0, 195.0, 1566.0, 227.0, 864.0, 227.0], "score": 0.99, "text": "operation measured by the number of disparity hypotheses"}, {"category_id": 15, "poly": [864.0, 227.0, 1568.0, 227.0, 1568.0, 259.0, 864.0, 259.0], "score": 0.99, "text": "evaluated per second, as shown in Table I1. 
It is also the second"}, {"category_id": 15, "poly": [862.0, 261.0, 1568.0, 261.0, 1568.0, 293.0, 862.0, 293.0], "score": 0.99, "text": "most accurate real-time method in terms of error rate, as"}, {"category_id": 15, "poly": [864.0, 296.0, 1564.0, 296.0, 1564.0, 325.0, 864.0, 325.0], "score": 1.0, "text": "measured using the Middlebury stereo evaluation benchmark."}, {"category_id": 15, "poly": [859.0, 323.0, 1568.0, 325.0, 1568.0, 358.0, 859.0, 355.0], "score": 0.98, "text": " It should be noted that it is difficult to establish an unbiased"}, {"category_id": 15, "poly": [862.0, 358.0, 1566.0, 358.0, 1566.0, 390.0, 862.0, 390.0], "score": 1.0, "text": "metric for speed comparisons, as the architecture, number of"}, {"category_id": 15, "poly": [866.0, 394.0, 1568.0, 394.0, 1568.0, 426.0, 866.0, 426.0], "score": 0.98, "text": "cores, and clock speed of graphics hardware used are not"}, {"category_id": 15, "poly": [862.0, 424.0, 1259.0, 429.0, 1259.0, 461.0, 861.0, 456.0], "score": 0.99, "text": "consistent across implementations."}, {"category_id": 15, "poly": [889.0, 1061.0, 1571.0, 1061.0, 1571.0, 1100.0, 889.0, 1100.0], "score": 1.0, "text": "While the majority of stereo matching algorithms focus"}, {"category_id": 15, "poly": [859.0, 1093.0, 1571.0, 1095.0, 1571.0, 1134.0, 859.0, 1132.0], "score": 0.99, "text": " on achieving high accuracy on still images, the volume of"}, {"category_id": 15, "poly": [862.0, 1130.0, 1564.0, 1130.0, 1564.0, 1162.0, 862.0, 1162.0], "score": 0.99, "text": "research aimed at recovery of temporally consistent disparity"}, {"category_id": 15, "poly": [862.0, 1162.0, 1568.0, 1162.0, 1568.0, 1201.0, 862.0, 1201.0], "score": 0.99, "text": "maps remains disproportionally small. 
This paper introduces"}, {"category_id": 15, "poly": [862.0, 1196.0, 1568.0, 1196.0, 1568.0, 1235.0, 862.0, 1235.0], "score": 0.98, "text": "an efficient temporal cost aggregation scheme that can easily"}, {"category_id": 15, "poly": [859.0, 1226.0, 1571.0, 1228.0, 1571.0, 1267.0, 859.0, 1265.0], "score": 0.99, "text": "be combined with conventional spatial cost aggregation to"}, {"category_id": 15, "poly": [864.0, 1265.0, 1568.0, 1265.0, 1568.0, 1297.0, 864.0, 1297.0], "score": 1.0, "text": "improve the accuracy of stereo matching when operating on"}, {"category_id": 15, "poly": [864.0, 1297.0, 1568.0, 1297.0, 1568.0, 1329.0, 864.0, 1329.0], "score": 0.99, "text": "video sequences. A synthetic video sequence, along with"}, {"category_id": 15, "poly": [864.0, 1331.0, 1568.0, 1331.0, 1568.0, 1364.0, 864.0, 1364.0], "score": 0.99, "text": "ground truth disparity data, was generated to evaluate the"}, {"category_id": 15, "poly": [862.0, 1361.0, 1571.0, 1361.0, 1571.0, 1400.0, 862.0, 1400.0], "score": 0.98, "text": "performance of the proposed method. It was shown that"}, {"category_id": 15, "poly": [864.0, 1398.0, 1571.0, 1398.0, 1571.0, 1430.0, 864.0, 1430.0], "score": 0.98, "text": "temporal aggregation is significantly more robust to noise than"}, {"category_id": 15, "poly": [862.0, 1430.0, 1497.0, 1430.0, 1497.0, 1462.0, 862.0, 1462.0], "score": 0.99, "text": "a method that only considers the current stereo frames."}, {"category_id": 15, "poly": [157.0, 1517.0, 838.0, 1517.0, 838.0, 1556.0, 157.0, 1556.0], "score": 0.99, "text": "The optimal value of the feedback coefficient is largely"}, {"category_id": 15, "poly": [134.0, 1554.0, 836.0, 1554.0, 836.0, 1584.0, 134.0, 1584.0], "score": 0.97, "text": "dependent on the noise being added to the image. 
Figure 4"}, {"category_id": 15, "poly": [132.0, 1655.0, 838.0, 1655.0, 838.0, 1684.0, 132.0, 1684.0], "score": 0.99, "text": "rely on the auxiliary cost when noise is high and it is more"}, {"category_id": 15, "poly": [132.0, 1684.0, 839.0, 1689.0, 838.0, 1721.0, 132.0, 1716.0], "score": 0.98, "text": "beneficial to rely on the current cost when noise is low. Figure"}, {"category_id": 15, "poly": [132.0, 1719.0, 839.0, 1723.0, 838.0, 1755.0, 132.0, 1751.0], "score": 1.0, "text": "5 illustrates the improvements that are achieved when applying"}, {"category_id": 15, "poly": [134.0, 1755.0, 836.0, 1755.0, 836.0, 1785.0, 134.0, 1785.0], "score": 0.98, "text": "temporal stereo matching to a particular pair of frames in the"}, {"category_id": 15, "poly": [134.0, 1788.0, 834.0, 1788.0, 834.0, 1820.0, 134.0, 1820.0], "score": 1.0, "text": "synthetic video sequence. Clearly, the noise in the disparity"}, {"category_id": 15, "poly": [134.0, 1822.0, 836.0, 1822.0, 836.0, 1854.0, 134.0, 1854.0], "score": 0.99, "text": "map is drastically reduced when temporal stereo matching is"}, {"category_id": 15, "poly": [132.0, 1856.0, 196.0, 1856.0, 196.0, 1886.0, 132.0, 1886.0], "score": 1.0, "text": "used."}, {"category_id": 15, "poly": [132.0, 1620.0, 165.0, 1620.0, 165.0, 1652.0, 132.0, 1652.0], "score": 0.99, "text": "to"}, {"category_id": 15, "poly": [220.0, 1620.0, 838.0, 1620.0, 838.0, 1652.0, 220.0, 1652.0], "score": 0.98, "text": ". As intuition would suggest, it is more beneficial to"}, {"category_id": 15, "poly": [127.0, 1584.0, 461.0, 1581.0, 461.0, 1620.0, 127.0, 1623.0], "score": 0.96, "text": " shows the optimal values of"}, {"category_id": 15, "poly": [483.0, 1584.0, 794.0, 1581.0, 794.0, 1620.0, 483.0, 1623.0], "score": 0.99, "text": "for noise ranging between"}, {"category_id": 15, "poly": [134.0, 160.0, 836.0, 160.0, 836.0, 192.0, 134.0, 192.0], "score": 0.99, "text": "over the most recent frames. 
By maintaining a reasonably"}, {"category_id": 15, "poly": [134.0, 229.0, 836.0, 229.0, 836.0, 261.0, 134.0, 261.0], "score": 0.98, "text": "edges, essentially reducing over-smoothing of a pixel's dis-"}, {"category_id": 15, "poly": [132.0, 261.0, 838.0, 261.0, 838.0, 293.0, 132.0, 293.0], "score": 0.99, "text": "parity when a pixel transitions from one depth to another in"}, {"category_id": 15, "poly": [130.0, 293.0, 354.0, 296.0, 353.0, 328.0, 129.0, 325.0], "score": 1.0, "text": "subsequent frames."}, {"category_id": 15, "poly": [134.0, 192.0, 300.0, 192.0, 300.0, 225.0, 134.0, 225.0], "score": 0.93, "text": "high value of"}, {"category_id": 15, "poly": [330.0, 192.0, 836.0, 192.0, 836.0, 225.0, 330.0, 225.0], "score": 0.99, "text": ", the auxiliary cost also preserves temporal"}, {"category_id": 15, "poly": [132.0, 1345.0, 836.0, 1348.0, 836.0, 1382.0, 132.0, 1380.0], "score": 1.0, "text": "Figure 5: A comparison of stereo matching without temporal"}, {"category_id": 15, "poly": [132.0, 1382.0, 834.0, 1382.0, 834.0, 1414.0, 132.0, 1414.0], "score": 0.98, "text": "cost aggregation (top\uff09 and with temporal cost aggregation"}, {"category_id": 15, "poly": [134.0, 1416.0, 836.0, 1416.0, 836.0, 1446.0, 134.0, 1446.0], "score": 0.98, "text": "(bottom) for a single frame in the synthetic video sequence"}, {"category_id": 15, "poly": [134.0, 1448.0, 337.0, 1446.0, 337.0, 1478.0, 134.0, 1480.0], "score": 0.98, "text": "where the noise is"}, {"category_id": 15, "poly": [392.0, 1448.0, 735.0, 1446.0, 735.0, 1478.0, 392.0, 1480.0], "score": 0.99, "text": "and the feedback coefficient is"}, {"category_id": 15, "poly": [896.0, 855.0, 1324.0, 857.0, 1323.0, 896.0, 896.0, 894.0], "score": 0.95, "text": "1I Millions of Disparity Estimates per Second."}, {"category_id": 15, "poly": [903.0, 912.0, 1550.0, 912.0, 1550.0, 944.0, 903.0, 944.0], "score": 0.99, "text": "3 As measured by the Middlebury stereo performance benchmark using"}, {"category_id": 15, "poly": [901.0, 887.0, 
1002.0, 887.0, 1002.0, 919.0, 901.0, 919.0], "score": 0.99, "text": "2Assumes"}, {"category_id": 15, "poly": [1106.0, 887.0, 1404.0, 887.0, 1404.0, 919.0, 1106.0, 919.0], "score": 0.98, "text": "images with 32 disparity levels."}, {"category_id": 15, "poly": [915.0, 937.0, 1036.0, 937.0, 1036.0, 969.0, 915.0, 969.0], "score": 0.96, "text": "the avgerage"}, {"category_id": 15, "poly": [1060.0, 937.0, 1192.0, 937.0, 1192.0, 969.0, 1060.0, 969.0], "score": 0.96, "text": "of bad pixels."}, {"category_id": 15, "poly": [873.0, 1515.0, 1571.0, 1515.0, 1571.0, 1545.0, 873.0, 1545.0], "score": 0.97, "text": "[1] D. Scharstein and R. Szeliski, \u201cA taxonomy and evaluation of dense "}, {"category_id": 15, "poly": [915.0, 1542.0, 1573.0, 1542.0, 1573.0, 1572.0, 915.0, 1572.0], "score": 0.98, "text": "two-frame stereo correspondence algorithms\u201d\u2019 International Journal of"}, {"category_id": 15, "poly": [915.0, 1565.0, 1409.0, 1565.0, 1409.0, 1597.0, 915.0, 1597.0], "score": 0.98, "text": "Computer Vision, vol. 47, pp. 7-42, April-June 2002."}, {"category_id": 15, "poly": [871.0, 1588.0, 1568.0, 1590.0, 1568.0, 1623.0, 871.0, 1620.0], "score": 0.98, "text": "[2] D. Scharstein and R. Szeliski, \u201cHigh-accuracy stereo depth maps using"}, {"category_id": 15, "poly": [915.0, 1616.0, 1568.0, 1616.0, 1568.0, 1648.0, 915.0, 1648.0], "score": 0.97, "text": "structured light,\u201d in In IEEE Computer Society Conference on Computer"}, {"category_id": 15, "poly": [915.0, 1641.0, 1508.0, 1641.0, 1508.0, 1673.0, 915.0, 1673.0], "score": 0.98, "text": "Vision and Pattern Recognition, vol. 1, pp. 195-202, June 2003."}, {"category_id": 15, "poly": [873.0, 1666.0, 1568.0, 1666.0, 1568.0, 1696.0, 873.0, 1696.0], "score": 0.99, "text": "[3] J. Kowalczuk, E. Psota, and L. 
Perez, \u201cReal-time stereo matching on"}, {"category_id": 15, "poly": [912.0, 1689.0, 1571.0, 1689.0, 1571.0, 1721.0, 912.0, 1721.0], "score": 0.98, "text": " CUDA using an iterative refinement method for adaptive support-weight"}, {"category_id": 15, "poly": [915.0, 1714.0, 1571.0, 1714.0, 1571.0, 1746.0, 915.0, 1746.0], "score": 0.99, "text": "correspondences,\u201d Circuits and Systems for Video Technology, IEEE"}, {"category_id": 15, "poly": [908.0, 1737.0, 1374.0, 1735.0, 1374.0, 1774.0, 908.0, 1776.0], "score": 0.96, "text": "Transactions on, vol. 23, Ppp. 94 -104, Jan. 2013."}, {"category_id": 15, "poly": [873.0, 1765.0, 1568.0, 1765.0, 1568.0, 1797.0, 873.0, 1797.0], "score": 0.99, "text": "[4] K.-J. Yoon and I.-S. Kweon, Locally adaptive support-weight approach"}, {"category_id": 15, "poly": [912.0, 1790.0, 1571.0, 1790.0, 1571.0, 1822.0, 912.0, 1822.0], "score": 0.97, "text": "for visual correspondence search,' in CVPR'05: Proceedings of the 2005"}, {"category_id": 15, "poly": [915.0, 1815.0, 1571.0, 1815.0, 1571.0, 1847.0, 915.0, 1847.0], "score": 0.96, "text": "IEEE Computer Society Conference on ComputerVision andPattern"}, {"category_id": 15, "poly": [915.0, 1840.0, 1568.0, 1840.0, 1568.0, 1872.0, 915.0, 1872.0], "score": 0.97, "text": "Recognition (CVPR'05) - Volume 2, (Washington, DC, USA), Pp. 924-"}, {"category_id": 15, "poly": [912.0, 1863.0, 1247.0, 1863.0, 1247.0, 1895.0, 912.0, 1895.0], "score": 0.98, "text": "931, IEEE Computer Society, 2005."}, {"category_id": 15, "poly": [873.0, 1891.0, 1568.0, 1891.0, 1568.0, 1923.0, 873.0, 1923.0], "score": 0.97, "text": "[5] L. Wang, M. Liao, M. Gong, R. Yang, and D. 
Nister, \u201cHigh-quality real-"}, {"category_id": 15, "poly": [912.0, 1916.0, 1566.0, 1916.0, 1566.0, 1946.0, 912.0, 1946.0], "score": 0.99, "text": "time stereo using adaptive cost aggregation and dynamic programming,\""}, {"category_id": 15, "poly": [910.0, 1936.0, 1568.0, 1939.0, 1568.0, 1971.0, 910.0, 1969.0], "score": 0.94, "text": "in 3DPVT'06:Proceedings of the Third International Symposium"}, {"category_id": 15, "poly": [915.0, 1964.0, 1568.0, 1964.0, 1568.0, 1996.0, 915.0, 1996.0], "score": 0.98, "text": "on 3D Data Processing, Visualization, and Transmission (3DPVT'06),"}, {"category_id": 15, "poly": [915.0, 1989.0, 1564.0, 1989.0, 1564.0, 2021.0, 915.0, 2021.0], "score": 1.0, "text": "(Washington, DC, USA), Pp. 798-805, IEEE Computer Society, 2006."}, {"category_id": 15, "poly": [1134.0, 1471.0, 1296.0, 1471.0, 1296.0, 1510.0, 1134.0, 1510.0], "score": 1.0, "text": "REFERENCES"}, {"category_id": 15, "poly": [159.0, 1888.0, 836.0, 1888.0, 836.0, 1920.0, 159.0, 1920.0], "score": 0.99, "text": "The algorithm was implement using NVIDIA's Compute"}, {"category_id": 15, "poly": [134.0, 1920.0, 834.0, 1920.0, 834.0, 1950.0, 134.0, 1950.0], "score": 0.98, "text": "Unified Device Architecture (CUDA). The details of the im-"}, {"category_id": 15, "poly": [129.0, 1948.0, 841.0, 1950.0, 841.0, 1989.0, 129.0, 1987.0], "score": 0.98, "text": " plementation are similar to those given in [3]. When compared "}, {"category_id": 15, "poly": [132.0, 1989.0, 836.0, 1989.0, 836.0, 2021.0, 132.0, 2021.0], "score": 0.99, "text": "to other existing real-time stereo matching implementations,"}, {"category_id": 15, "poly": [1111.0, 1022.0, 1317.0, 1022.0, 1317.0, 1061.0, 1111.0, 1061.0], "score": 1.0, "text": "V. 
CONCLUSION"}, {"category_id": 15, "poly": [864.0, 484.0, 1564.0, 484.0, 1564.0, 516.0, 864.0, 516.0], "score": 0.99, "text": "Table II: A comparison of speed and accuracy for the imple-"}, {"category_id": 15, "poly": [864.0, 518.0, 1564.0, 518.0, 1564.0, 550.0, 864.0, 550.0], "score": 0.99, "text": "mentations of many leading real-time stereo matching meth-"}, {"category_id": 15, "poly": [862.0, 550.0, 917.0, 550.0, 917.0, 584.0, 862.0, 584.0], "score": 0.96, "text": "ods."}, {"category_id": 15, "poly": [864.0, 484.0, 1564.0, 484.0, 1564.0, 516.0, 864.0, 516.0], "score": 0.99, "text": "Table II: A comparison of speed and accuracy for the imple-"}, {"category_id": 15, "poly": [864.0, 518.0, 1564.0, 518.0, 1564.0, 550.0, 864.0, 550.0], "score": 0.99, "text": "mentations of many leading real-time stereo matching meth-"}, {"category_id": 15, "poly": [862.0, 550.0, 917.0, 550.0, 917.0, 584.0, 862.0, 584.0], "score": 0.96, "text": "ods."}], "page_info": {"page_no": 4, "height": 2200, "width": 1700}}, {"layout_dets": [{"category_id": 1, "poly": [134.58497619628906, 157.681884765625, 841.3460693359375, 157.681884765625, 841.3460693359375, 1666.27001953125, 134.58497619628906, 1666.27001953125], "score": 0.9999936819076538}, {"category_id": 15, "poly": [143.0, 163.0, 838.0, 163.0, 838.0, 192.0, 143.0, 192.0], "score": 0.97, "text": "[6] W. Yu, T. Chen, F. Franchetti, and J. C. Hoe, \u201cHigh performance stereo"}, {"category_id": 15, "poly": [182.0, 188.0, 838.0, 188.0, 838.0, 218.0, 182.0, 218.0], "score": 0.98, "text": "vision designed for massively data parallel platforms,\u2019 Circuits and"}, {"category_id": 15, "poly": [182.0, 213.0, 841.0, 213.0, 841.0, 245.0, 182.0, 245.0], "score": 0.98, "text": "Systems for Video Technology, IEEE Transactions on, vol. 20, pp. 
1509"}, {"category_id": 15, "poly": [182.0, 238.0, 411.0, 238.0, 411.0, 268.0, 182.0, 268.0], "score": 0.98, "text": "-1519, November 2010."}, {"category_id": 15, "poly": [143.0, 264.0, 838.0, 264.0, 838.0, 293.0, 143.0, 293.0], "score": 0.99, "text": "[7] S. Mattoccia, M. Viti, and F. Ries, \u201cNear real-time fast bilateral stereo"}, {"category_id": 15, "poly": [182.0, 289.0, 838.0, 289.0, 838.0, 319.0, 182.0, 319.0], "score": 0.96, "text": "on the GPU in Computer Vision and Pattern Recognition Workshops"}, {"category_id": 15, "poly": [178.0, 307.0, 841.0, 309.0, 841.0, 348.0, 178.0, 346.0], "score": 0.95, "text": "(CVPRW), 2011 IEEE Computer Society Conference on,Ppp. 136 -143,"}, {"category_id": 15, "poly": [185.0, 339.0, 289.0, 339.0, 289.0, 364.0, 185.0, 364.0], "score": 0.98, "text": "June 2011."}, {"category_id": 15, "poly": [141.0, 362.0, 838.0, 362.0, 838.0, 392.0, 141.0, 392.0], "score": 0.98, "text": "[8] K. Zhang, J. Lu, Q. Yang, G. Lafruit, R. Lauwereins, and L. Van Gool,"}, {"category_id": 15, "poly": [182.0, 387.0, 838.0, 387.0, 838.0, 419.0, 182.0, 419.0], "score": 0.98, "text": "\"Real-time and accurate stereo: A scalable approach with bitwise fast"}, {"category_id": 15, "poly": [185.0, 412.0, 838.0, 412.0, 838.0, 445.0, 185.0, 445.0], "score": 0.97, "text": "voting on CUDA,\u201d Circuits and Systems for Video Technology, IEEE"}, {"category_id": 15, "poly": [182.0, 438.0, 656.0, 438.0, 656.0, 468.0, 182.0, 468.0], "score": 0.99, "text": "Transactions on, vol. 21, pp. 867 -878, July 2011."}, {"category_id": 15, "poly": [141.0, 463.0, 838.0, 463.0, 838.0, 493.0, 141.0, 493.0], "score": 0.96, "text": "[9] C. Rhemann, A. Hosni, M. Bleyer, C. Rother, and M. 
Gelautz, \u201cFast cost-"}, {"category_id": 15, "poly": [182.0, 488.0, 838.0, 488.0, 838.0, 518.0, 182.0, 518.0], "score": 0.98, "text": "volume filtering for visual correspondence and beyond,\" in Computer"}, {"category_id": 15, "poly": [180.0, 509.0, 841.0, 511.0, 841.0, 543.0, 180.0, 541.0], "score": 0.95, "text": "Vision and Pattern Recognition (CVPR), 20ll IEEE Conference on,"}, {"category_id": 15, "poly": [180.0, 536.0, 448.0, 534.0, 448.0, 566.0, 180.0, 568.0], "score": 0.99, "text": "Pp. 3017 -3024, June 2011."}, {"category_id": 15, "poly": [134.0, 561.0, 838.0, 561.0, 838.0, 591.0, 134.0, 591.0], "score": 0.99, "text": "[10] A. Hosni, C. Rhemann, M. Bleyer, and M. Gelautz, \u201cTemporally con-"}, {"category_id": 15, "poly": [180.0, 587.0, 836.0, 587.0, 836.0, 616.0, 180.0, 616.0], "score": 0.99, "text": " sistent disparity and optical flow via efficient spatio-temporal filtering,\""}, {"category_id": 15, "poly": [182.0, 612.0, 838.0, 612.0, 838.0, 642.0, 182.0, 642.0], "score": 0.97, "text": "in Advances in Image and Video Technology (Y.-S. Ho, ed.), vol. 7087"}, {"category_id": 15, "poly": [180.0, 632.0, 845.0, 632.0, 845.0, 671.0, 180.0, 671.0], "score": 0.88, "text": "of Lectureotes inComputer Science,pp.16517,Springererlin /"}, {"category_id": 15, "poly": [182.0, 660.0, 353.0, 660.0, 353.0, 692.0, 182.0, 692.0], "score": 1.0, "text": "Heidelberg, 2012."}, {"category_id": 15, "poly": [134.0, 685.0, 838.0, 685.0, 838.0, 717.0, 134.0, 717.0], "score": 0.98, "text": "[11] C. Tomasi and R. Manduchi, \u201cBilateral filtering for gray and color"}, {"category_id": 15, "poly": [182.0, 710.0, 838.0, 710.0, 838.0, 742.0, 182.0, 742.0], "score": 0.98, "text": "images,\u201d in Computer Vision, 1998. Sixth International Conference on,"}, {"category_id": 15, "poly": [180.0, 736.0, 411.0, 731.0, 411.0, 763.0, 181.0, 768.0], "score": 0.93, "text": "pPp. 
839 -846, jan 1998."}, {"category_id": 15, "poly": [132.0, 761.0, 838.0, 761.0, 838.0, 791.0, 132.0, 791.0], "score": 0.97, "text": "[12] K. He, J. Sun, and X. Tang, \u201cGuided image filtering,\u201d\u2019 in Computer"}, {"category_id": 15, "poly": [180.0, 784.0, 838.0, 786.0, 838.0, 818.0, 180.0, 816.0], "score": 0.98, "text": "Vision - ECCV 2010, vol. 6311 of Lecture Notes in Computer Science,"}, {"category_id": 15, "poly": [180.0, 811.0, 607.0, 807.0, 608.0, 839.0, 180.0, 843.0], "score": 0.98, "text": "pp. 1-14, Springer Berlin / Heidelberg, 2010."}, {"category_id": 15, "poly": [129.0, 832.0, 839.0, 837.0, 838.0, 869.0, 129.0, 864.0], "score": 0.98, "text": "[13] L. Zhang, B. Curless, and S. M. Seitz, \u201cSpacetime stereo: Shape"}, {"category_id": 15, "poly": [182.0, 862.0, 836.0, 862.0, 836.0, 891.0, 182.0, 891.0], "score": 0.98, "text": "recovery for dynamic scenes,\u201d in IEEE Computer Society Conference"}, {"category_id": 15, "poly": [182.0, 885.0, 834.0, 885.0, 834.0, 917.0, 182.0, 917.0], "score": 0.97, "text": "on Computer Vision and Pattern Recognition, pp. 367-374, June 2003."}, {"category_id": 15, "poly": [132.0, 910.0, 838.0, 910.0, 838.0, 940.0, 132.0, 940.0], "score": 0.98, "text": "[14] J. Davis, D. Nehab, R. Ramamoorthi, and S. Rusinkiewicz, \u201cSpacetime"}, {"category_id": 15, "poly": [182.0, 935.0, 838.0, 935.0, 838.0, 965.0, 182.0, 965.0], "score": 0.97, "text": "stereo: a unifying framework for depth from triangulation,\u201d\u2019 Pattern"}, {"category_id": 15, "poly": [182.0, 960.0, 838.0, 960.0, 838.0, 990.0, 182.0, 990.0], "score": 0.98, "text": "Analysis and Machine Intelligence, IEEE Transactions on,vol. 27,"}, {"category_id": 15, "poly": [180.0, 983.0, 462.0, 983.0, 462.0, 1015.0, 180.0, 1015.0], "score": 0.97, "text": "Pp. 296 -302, February 2005."}, {"category_id": 15, "poly": [132.0, 1011.0, 838.0, 1011.0, 838.0, 1040.0, 132.0, 1040.0], "score": 0.99, "text": "[15] E. Larsen, P. Mordohai, M. Pollefeys, and H. 
Fuchs, \u201cTemporally"}, {"category_id": 15, "poly": [182.0, 1036.0, 836.0, 1036.0, 836.0, 1066.0, 182.0, 1066.0], "score": 0.99, "text": "consistent reconstruction from multiple video streams using enhanced"}, {"category_id": 15, "poly": [178.0, 1054.0, 843.0, 1056.0, 843.0, 1095.0, 178.0, 1093.0], "score": 0.95, "text": "belief propagation in Computer Vision, 2007.ICCV 2007. IEEE1lth"}, {"category_id": 15, "poly": [180.0, 1082.0, 644.0, 1082.0, 644.0, 1121.0, 180.0, 1121.0], "score": 0.97, "text": "International Conference on, pp. 1 -8, oct. 2007."}, {"category_id": 15, "poly": [134.0, 1109.0, 838.0, 1109.0, 838.0, 1141.0, 134.0, 1141.0], "score": 0.97, "text": "[16] M. Bleyer, M. Gelautz, C. Rother, and C. Rhemann, \u201c\"A stereo approach"}, {"category_id": 15, "poly": [180.0, 1134.0, 838.0, 1134.0, 838.0, 1166.0, 180.0, 1166.0], "score": 0.99, "text": "that handles the mating problem via image warping\" in Computer"}, {"category_id": 15, "poly": [182.0, 1157.0, 838.0, 1157.0, 838.0, 1189.0, 182.0, 1189.0], "score": 0.98, "text": "Vision and Pattern Recognition, 2009. CVPR 2009. IEEE Conference"}, {"category_id": 15, "poly": [180.0, 1183.0, 459.0, 1175.0, 460.0, 1212.0, 181.0, 1219.0], "score": 0.98, "text": "on, pp. 501 -508, June 2009."}, {"category_id": 15, "poly": [129.0, 1205.0, 838.0, 1208.0, 838.0, 1240.0, 129.0, 1237.0], "score": 0.98, "text": " [17] M. Sizintsev and R. Wildes, \u201cSpatiotemporal stereo via spatiotemporal"}, {"category_id": 15, "poly": [182.0, 1235.0, 838.0, 1235.0, 838.0, 1265.0, 182.0, 1265.0], "score": 0.97, "text": "quadric element (stequel) matching,\u201d in Computer Vision and Pattern"}, {"category_id": 15, "poly": [185.0, 1258.0, 841.0, 1258.0, 841.0, 1290.0, 185.0, 1290.0], "score": 0.98, "text": "Recognition, 2009. CVPR 2009. IEEE Conference on, Pp. 
493 -500,"}, {"category_id": 15, "poly": [185.0, 1286.0, 286.0, 1286.0, 286.0, 1311.0, 185.0, 1311.0], "score": 0.99, "text": "june 2009."}, {"category_id": 15, "poly": [132.0, 1309.0, 838.0, 1309.0, 838.0, 1338.0, 132.0, 1338.0], "score": 0.97, "text": "[18] M. Sizintsev and R. Wildes, \u201cSpatiotemporal stereo and scene flow via"}, {"category_id": 15, "poly": [182.0, 1334.0, 841.0, 1334.0, 841.0, 1364.0, 182.0, 1364.0], "score": 0.97, "text": "stequel matching,\u201d\u2019Pattern Analysis and Machine Intelligence, IEEE"}, {"category_id": 15, "poly": [182.0, 1359.0, 684.0, 1359.0, 684.0, 1391.0, 182.0, 1391.0], "score": 1.0, "text": "Transactions on, vol. 34, pp. 1206 -1219, june 2012."}, {"category_id": 15, "poly": [132.0, 1382.0, 834.0, 1382.0, 834.0, 1412.0, 132.0, 1412.0], "score": 0.98, "text": "[19] C. Richardt, D. Orr, I. Davies, A. Criminisi, and N. A. Dodgson,"}, {"category_id": 15, "poly": [185.0, 1409.0, 838.0, 1409.0, 838.0, 1441.0, 185.0, 1441.0], "score": 0.98, "text": "\"Real-time spatiotemporal stereo matching using the dual-cross-bilateral"}, {"category_id": 15, "poly": [182.0, 1432.0, 838.0, 1432.0, 838.0, 1464.0, 182.0, 1464.0], "score": 0.95, "text": "grid,\" in Proceedings of the European Conference on Computer Vision"}, {"category_id": 15, "poly": [182.0, 1458.0, 838.0, 1458.0, 838.0, 1490.0, 182.0, 1490.0], "score": 0.98, "text": "(ECCV), Lecture Notes in Computer Science, pp. 510-523, September"}, {"category_id": 15, "poly": [182.0, 1477.0, 243.0, 1483.0, 241.0, 1511.0, 179.0, 1505.0], "score": 1.0, "text": "2010."}, {"category_id": 15, "poly": [134.0, 1508.0, 836.0, 1508.0, 836.0, 1538.0, 134.0, 1538.0], "score": 0.98, "text": "[20] S. Paris and F. Durand, \u201cA fast approximation of the bilateral filter using"}, {"category_id": 15, "poly": [182.0, 1533.0, 836.0, 1533.0, 836.0, 1565.0, 182.0, 1565.0], "score": 0.98, "text": "a signal processing approach,\u201d Int. J. Comput. Vision, vol. 81, pp. 
24-52,"}, {"category_id": 15, "poly": [185.0, 1561.0, 282.0, 1561.0, 282.0, 1586.0, 185.0, 1586.0], "score": 0.98, "text": "Jan. 2009."}, {"category_id": 15, "poly": [134.0, 1584.0, 836.0, 1584.0, 836.0, 1613.0, 134.0, 1613.0], "score": 0.98, "text": "[21] Q. Yang, L. Wang, R. Yang, S. Wang, M. Liao, and D. Nist\u00e9r, \u201cReal-"}, {"category_id": 15, "poly": [182.0, 1609.0, 838.0, 1609.0, 838.0, 1641.0, 182.0, 1641.0], "score": 0.98, "text": "time global stereo matching using hierarchical belief propagation.\u201d in"}, {"category_id": 15, "poly": [182.0, 1634.0, 698.0, 1634.0, 698.0, 1666.0, 182.0, 1666.0], "score": 1.0, "text": "British Machine Vision Conference, pp. 989-998, 2006."}], "page_info": {"page_no": 5, "height": 2200, "width": 1700}}] \ No newline at end of file diff --git a/demo/demo2.pdf b/demo/demo2.pdf new file mode 100644 index 0000000000000000000000000000000000000000..c8659f212cfb28350c1d71c44bd84fe58df2ca24 --- /dev/null +++ b/demo/demo2.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e94e95637356e1599510436278747d1150a3dfb822233bdc77a9dcb9a4fc6e4 +size 1808096 diff --git a/docs/FAQ_zh_cn.md b/docs/FAQ_zh_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..c5f76e1017a77b8fcd08091f2bebe09116ec1154 --- /dev/null +++ b/docs/FAQ_zh_cn.md @@ -0,0 +1,85 @@ +# 常见问题解答 + +### 1.离线部署首次运行,报错urllib.error.URLError: + +首次运行需要在线下载一个小的语言检测模型,如果是离线部署需要手动下载该模型并放到指定目录。 +参考:https://github.com/opendatalab/MinerU/issues/121 + +### 2.在较新版本的mac上使用命令安装pip install magic-pdf[full-cpu] zsh: no matches found: magic-pdf[full-cpu] + +在 macOS 上,默认的 shell 从 Bash 切换到了 Z shell,而 Z shell 对于某些类型的字符串匹配有特殊的处理逻辑,这可能导致no matches found错误。 +可以通过在命令行禁用globbing特性,再尝试运行安装命令 +```bash +setopt no_nomatch +pip install magic-pdf[full-cpu] +``` + +### 3.在intel cpu 的mac上 安装最新版的完整功能包 magic-pdf[full-cpu] (0.6.x) 不成功 + +完整功能包依赖的公式解析库unimernet限制了pytorch的最低版本为2.3.0,而pytorch官方没有为intel cpu的macOS 提供2.3.0版本的预编译包,所以会产生依赖不兼容的问题。 
+可以先尝试安装unimernet的老版本之后再尝试安装完整功能包的其他依赖。(为避免依赖冲突,请激活一个全新的虚拟环境) +```bash +pip install magic-pdf +pip install unimernet==0.1.0 +pip install matplotlib ultralytics paddleocr==2.7.3 paddlepaddle +``` + +### 4.在部分较新的M芯片macOS设备上,MPS加速开启失败 + +卸载torch和torchvision,重新安装nightly构建版torch和torchvision +```bash +pip uninstall torch torchvision +pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cpu +``` +参考: https://github.com/opendatalab/PDF-Extract-Kit/issues/23 + +### 5.使用过程中遇到paddle相关的报错FatalError: Illegal instruction is detected by the operating system. + +paddlepaddle 2.6.1与部分linux系统环境存在兼容性问题。 +可尝试降级到2.5.2使用, +```bash +pip install paddlepaddle==2.5.2 +``` +或卸载paddlepaddle,重新安装paddlepaddle-gpu +```bash +pip uninstall paddlepaddle +pip install paddlepaddle-gpu +``` +参考:https://github.com/opendatalab/MinerU/issues/146 + +### 6.使用过程中遇到_pickle.UnpicklingError: invalid load key, 'v'.错误 + +可能是由于模型文件未下载完整导致,可尝试重现下载模型文件后再试 +参考:https://github.com/opendatalab/MinerU/issues/143 + +### 7.程序运行完成后,找不到tmp目录 + +程序输出目录是在"magic-pdf.json"中通过 +```json +{ + "temp-output-dir": "/tmp" +} +``` +进行配置的。 +如果没有更改这个参数,使用默认的配置执行程序,在linux/macOS会在绝对路径"/tmp"下创建一个"magic-pdf"文件夹作为输出路径。 +而在windows下,默认的输出路径与执行命令时,命令行所在的盘符相关,如果命令行在C盘,则默认输出路径为"C://tmp/magic-pdf"。 +参考:https://github.com/opendatalab/MinerU/issues/149 + +### 8.模型文件应该下载到哪里/models-dir的配置应该怎么填 + +模型文件的路径输入是在"magic-pdf.json"中通过 +```json +{ + "models-dir": "/tmp/models" +} +``` +进行配置的。 +这个路径是绝对路径而不是相对路径,绝对路径的获取可在models目录中通过命令 "pwd" 获取。 +参考:https://github.com/opendatalab/MinerU/issues/155#issuecomment-2230216874 + +### 9.命令行中 --model "model_json_path" 指的是什么? 
+ +model_json 指的是通过模型分析后生成的一种有特定格式的json文件。 +如果使用 https://github.com/opendatalab/PDF-Extract-Kit 项目生成,该文件一般在项目的output目录下。 +如果使用 MinerU 的命令行调用内置的模型分析,该文件一般在输出路径"/tmp/magic-pdf/pdf-name"下。 +参考:https://github.com/opendatalab/MinerU/issues/128 \ No newline at end of file diff --git a/docs/how_to_download_models_en.md b/docs/how_to_download_models_en.md new file mode 100644 index 0000000000000000000000000000000000000000..7ae4982418dde1b3544c4161cfec3af6dd183007 --- /dev/null +++ b/docs/how_to_download_models_en.md @@ -0,0 +1,60 @@ +### Install Git LFS +Before you begin, make sure Git Large File Storage (Git LFS) is installed on your system. Install it using the following command: + +```bash +git lfs install +``` + +### Download the Model from Hugging Face +To download the `PDF-Extract-Kit` model from Hugging Face, use the following command: + +```bash +git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit +``` + +Ensure that Git LFS is enabled during the clone to properly download all large files. 
+ + + +### Download the Model from ModelScope + +#### SDK Download + +```bash +# First, install the ModelScope library using pip: +pip install modelscope +``` + +```python +# Use the following Python code to download the model using the ModelScope SDK: +from modelscope import snapshot_download +model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') +``` + +#### Git Download +Alternatively, you can use Git to clone the model repository from ModelScope: + +```bash +git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git +``` + + +Put [model files]() here: + +``` +./ +├── Layout +│ ├── config.json +│ └── weights.pth +├── MFD +│ └── weights.pt +├── MFR +│ └── UniMERNet +│ ├── config.json +│ ├── preprocessor_config.json +│ ├── pytorch_model.bin +│ ├── README.md +│ ├── tokenizer_config.json +│ └── tokenizer.json +└── README.md +``` \ No newline at end of file diff --git a/docs/how_to_download_models_zh_cn.md b/docs/how_to_download_models_zh_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..d35a33360594139c50ba4a80747f37bab476b0a8 --- /dev/null +++ b/docs/how_to_download_models_zh_cn.md @@ -0,0 +1,61 @@ +### 安装 Git LFS +开始之前,请确保您的系统上已安装 Git 大文件存储 (Git LFS)。使用以下命令进行安装 + +```bash +git lfs install +``` + +### 从 Hugging Face 下载模型 +请使用以下命令从 Hugging Face 下载 PDF-Extract-Kit 模型: + +```bash +git lfs clone https://huggingface.co/wanderkid/PDF-Extract-Kit +``` + +确保在克隆过程中启用了 Git LFS,以便正确下载所有大文件。 + + +### 从 ModelScope 下载模型 + +#### SDK下载 + +```bash +# 首先安装modelscope +pip install modelscope +``` + +```python +# 使用modelscope sdk下载模型 +from modelscope import snapshot_download +model_dir = snapshot_download('wanderkid/PDF-Extract-Kit') +``` + +#### Git下载 +也可以使用git clone从 ModelScope 下载模型: + +```bash +git clone https://www.modelscope.cn/wanderkid/PDF-Extract-Kit.git +``` + + +将 'models' 目录移动到具有较大磁盘空间的目录中,最好是在固态硬盘(SSD)上。 + + +模型文件夹的结构如下,包含了不同组件的配置文件和权重文件: +``` +./ +├── Layout +│ ├── config.json +│ └── model_final.pth +├── MFD +│ └── weights.pt +├── 
MFR +│ └── UniMERNet +│ ├── config.json +│ ├── preprocessor_config.json +│ ├── pytorch_model.bin +│ ├── README.md +│ ├── tokenizer_config.json +│ └── tokenizer.json +└── README.md +``` diff --git a/docs/images/flowchart_en.png b/docs/images/flowchart_en.png new file mode 100644 index 0000000000000000000000000000000000000000..b490011ea7a9edebeb9edbf3980fc971d89bb76a Binary files /dev/null and b/docs/images/flowchart_en.png differ diff --git a/docs/images/flowchart_zh_cn.png b/docs/images/flowchart_zh_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..32e0a14233972666cf4507af178c77cfe41e3311 Binary files /dev/null and b/docs/images/flowchart_zh_cn.png differ diff --git a/docs/images/project_panorama_en.png b/docs/images/project_panorama_en.png new file mode 100644 index 0000000000000000000000000000000000000000..19616da641076c037c47919c0d3de9efb8e409da Binary files /dev/null and b/docs/images/project_panorama_en.png differ diff --git a/docs/images/project_panorama_zh_cn.png b/docs/images/project_panorama_zh_cn.png new file mode 100644 index 0000000000000000000000000000000000000000..3cd6843e05b827cb0a7c9cc31855de0d3f4645c5 Binary files /dev/null and b/docs/images/project_panorama_zh_cn.png differ diff --git a/magic-pdf.template.json b/magic-pdf.template.json new file mode 100644 index 0000000000000000000000000000000000000000..2c0223db0deb5e54543b92e073bd598e60047d73 --- /dev/null +++ b/magic-pdf.template.json @@ -0,0 +1,9 @@ +{ + "bucket_info":{ + "bucket-name-1":["ak", "sk", "endpoint"], + "bucket-name-2":["ak", "sk", "endpoint"] + }, + "temp-output-dir":"/tmp", + "models-dir":"/tmp/models", + "device-mode":"cpu" +} \ No newline at end of file diff --git a/magic_pdf/__init__.py b/magic_pdf/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/cli/__init__.py b/magic_pdf/cli/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/cli/magicpdf.py b/magic_pdf/cli/magicpdf.py new file mode 100644 index 0000000000000000000000000000000000000000..5ff6bc8b50edf47e82e03d8f5d4b4b7c9311767f --- /dev/null +++ b/magic_pdf/cli/magicpdf.py @@ -0,0 +1,359 @@ +""" +这里实现2个click命令: +第一个: + 接收一个完整的s3路径,例如:s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350 + 1)根据~/magic-pdf.json里的ak,sk等,构造s3cliReader读取到这个jsonl的对应行,返回json对象。 + 2)根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint,构造出s3cliReader用来读取pdf + 3)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter,用来保存截图 + 4)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 + + 最后把以上步骤准备好的对象传入真正的解析API + +第二个: + 接收1)pdf的本地路径。2)模型json文件(可选)。然后: + 1)根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置,构造出LocalImageWriter,用来保存截图 + 2)从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter,用来读写本地文件 + 3)根据约定,根据pdf本地路径,推导出pdf模型的json,并读入 + + +效果: +python magicpdf.py json-command --json s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350 +python magicpdf.py pdf-command --pdf /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json 或者 python magicpdf.py --pdf /home/llm/Downloads/xxxx.pdf +""" + +import os +import json as json_parse +import click +from loguru import logger +from pathlib import Path +from magic_pdf.libs.version import __version__ + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox +from magic_pdf.pipe.UNIPipe import UNIPipe +from magic_pdf.pipe.OCRPipe import OCRPipe +from magic_pdf.pipe.TXTPipe import TXTPipe +from magic_pdf.libs.path_utils import ( + parse_s3path, + parse_s3_range_params, + remove_non_official_s3_args, +) +from magic_pdf.libs.config_reader import ( + get_local_dir, + get_s3_config, +) +from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter 
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +import csv +import copy +import magic_pdf.model as model_config + +parse_pdf_methods = click.Choice(["ocr", "txt", "auto"]) + + +def prepare_env(pdf_file_name, method): + local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method) + + local_image_dir = os.path.join(str(local_parent_dir), "images") + local_md_dir = local_parent_dir + os.makedirs(local_image_dir, exist_ok=True) + os.makedirs(local_md_dir, exist_ok=True) + return local_image_dir, local_md_dir + + +def write_to_csv(csv_file_path, csv_data): + with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile: + # 创建csv writer对象 + csv_writer = csv.writer(csvfile) + # 写入数据 + csv_writer.writerow(csv_data) + logger.info(f"数据已成功追加到 '{csv_file_path}'") + + +def do_parse( + pdf_file_name, + pdf_bytes, + model_list, + parse_method, + f_draw_span_bbox=True, + f_draw_layout_bbox=True, + f_dump_md=True, + f_dump_middle_json=True, + f_dump_model_json=True, + f_dump_orig_pdf=True, + f_dump_content_list=True, + f_make_md_mode=MakeMode.MM_MD, +): + + orig_model_list = copy.deepcopy(model_list) + + local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method) + logger.info(f"local output dir is {local_md_dir}") + image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir) + image_dir = str(os.path.basename(local_image_dir)) + + if parse_method == "auto": + jso_useful_key = {"_pdf_type": "", "model_list": model_list} + pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True) + elif parse_method == "txt": + pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True) + elif parse_method == "ocr": + pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True) + else: + logger.error("unknown parse method") + exit(1) + + pipe.pipe_classify() + + """如果没有传入有效的模型数据,则使用内置model解析""" + if len(model_list) == 0: + if model_config.__use_inside_model__: + 
pipe.pipe_analyze() + orig_model_list = copy.deepcopy(pipe.model_list) + else: + logger.error("need model list input") + exit(1) + + pipe.pipe_parse() + pdf_info = pipe.pdf_mid_data["pdf_info"] + if f_draw_layout_bbox: + draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) + if f_draw_span_bbox: + draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) + + md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode) + if f_dump_md: + """写markdown""" + md_writer.write( + content=md_content, + path=f"{pdf_file_name}.md", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_middle_json: + """写middle_json""" + md_writer.write( + content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_middle.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_model_json: + """写model_json""" + md_writer.write( + content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_model.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + if f_dump_orig_pdf: + """写源pdf""" + md_writer.write( + content=pdf_bytes, + path=f"{pdf_file_name}_origin.pdf", + mode=AbsReaderWriter.MODE_BIN, + ) + + content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE) + if f_dump_content_list: + """写content_list""" + md_writer.write( + content=json_parse.dumps(content_list, ensure_ascii=False, indent=4), + path=f"{pdf_file_name}_content_list.json", + mode=AbsReaderWriter.MODE_TXT, + ) + + +@click.group() +@click.version_option(__version__, "--version", "-v", help="显示版本信息") +@click.help_option("--help", "-h", help="显示帮助信息") +def cli(): + pass + + +@cli.command() +@click.option("--json", type=str, help="输入一个S3路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, 
default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def json_command(json, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + if not json.startswith("s3://"): + logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path") + exit(1) + + def read_s3_path(s3path): + bucket, key = parse_s3path(s3path) + + s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket) + s3_rw = S3ReaderWriter( + s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path) + ) + may_range_params = parse_s3_range_params(s3path) + if may_range_params is None or 2 != len(may_range_params): + byte_start, byte_end = 0, None + else: + byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1]) + byte_end += byte_start - 1 + return s3_rw.read_jsonl( + remove_non_official_s3_args(s3path), + byte_start, + byte_end, + AbsReaderWriter.MODE_BIN, + ) + + jso = json_parse.loads(read_s3_path(json).decode("utf-8")) + s3_file_path = jso.get("file_location") + if s3_file_path is None: + s3_file_path = jso.get("path") + pdf_file_name = Path(s3_file_path).stem + pdf_data = read_s3_path(s3_file_path) + + do_parse( + pdf_file_name, + pdf_data, + jso["doc_layout_result"], + method, + ) + + +@cli.command() +@click.option("--local_json", type=str, help="输入一个本地jsonl路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def local_json_command(local_json, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + def read_s3_path(s3path): + bucket, key = parse_s3path(s3path) + + s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket) 
+ s3_rw = S3ReaderWriter( + s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path) + ) + may_range_params = parse_s3_range_params(s3path) + if may_range_params is None or 2 != len(may_range_params): + byte_start, byte_end = 0, None + else: + byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1]) + byte_end += byte_start - 1 + return s3_rw.read_jsonl( + remove_non_official_s3_args(s3path), + byte_start, + byte_end, + AbsReaderWriter.MODE_BIN, + ) + + with open(local_json, "r", encoding="utf-8") as f: + for json_line in f: + jso = json_parse.loads(json_line) + + s3_file_path = jso.get("file_location") + if s3_file_path is None: + s3_file_path = jso.get("path") + pdf_file_name = Path(s3_file_path).stem + pdf_data = read_s3_path(s3_file_path) + do_parse( + pdf_file_name, + pdf_data, + jso["doc_layout_result"], + method, + ) + + +@cli.command() +@click.option( + "--pdf", type=click.Path(exists=True), required=True, + help='pdf 文件路径, 支持单个文件或文件列表, 文件列表需要以".list"结尾, 一行一个pdf文件路径') +@click.option("--model", type=click.Path(exists=True), help="模型的路径") +@click.option( + "--method", + type=parse_pdf_methods, + help="指定解析方法。txt: 文本型 pdf 解析方法, ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法", + default="auto", +) +@click.option("--inside_model", type=click.BOOL, default=True, help="使用内置模型测试") +@click.option("--model_mode", type=click.STRING, default="full", + help="内置模型选择。lite: 快速解析,精度较低,full: 高精度解析,速度较慢") +def pdf_command(pdf, model, method, inside_model, model_mode): + model_config.__use_inside_model__ = inside_model + model_config.__model_mode__ = model_mode + + def read_fn(path): + disk_rw = DiskReaderWriter(os.path.dirname(path)) + return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN) + + def get_model_json(model_path, doc_path): + # 这里处理pdf和模型相关的逻辑 + if model_path is None: + file_name_without_extension, extension = os.path.splitext(doc_path) + if extension == ".pdf": + model_path = file_name_without_extension + ".json" + else: + raise 
Exception("pdf_path input error") + if not os.path.exists(model_path): + logger.warning( + f"not found json {model_path} existed" + ) + # 本地无模型数据则调用内置paddle分析,先传空list,在内部识别到空list再调用paddle + model_json = "[]" + else: + model_json = read_fn(model_path).decode("utf-8") + else: + model_json = read_fn(model_path).decode("utf-8") + + return model_json + + def parse_doc(doc_path): + try: + file_name = str(Path(doc_path).stem) + pdf_data = read_fn(doc_path) + jso = json_parse.loads(get_model_json(model, doc_path)) + + do_parse( + file_name, + pdf_data, + jso, + method, + ) + + except Exception as e: + logger.exception(e) + + if not pdf: + logger.error(f"Error: Missing argument '--pdf'.") + exit(f"Error: Missing argument '--pdf'.") + else: + '''适配多个文档的list文件输入''' + if pdf.endswith(".list"): + with open(pdf, "r") as f: + for line in f.readlines(): + line = line.strip() + parse_doc(line) + else: + '''适配单个文档的输入''' + parse_doc(pdf) + + +if __name__ == "__main__": + """ + python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551 + """ + cli() diff --git a/magic_pdf/dict2md/__init__.py b/magic_pdf/dict2md/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/dict2md/mkcontent.py b/magic_pdf/dict2md/mkcontent.py new file mode 100644 index 0000000000000000000000000000000000000000..049e290e5433f4f19c8a43c7ddb634dcab149ffc --- /dev/null +++ b/magic_pdf/dict2md/mkcontent.py @@ -0,0 +1,397 @@ +import math +from loguru import logger + +from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.ocr_content_type import ContentType + +TYPE_INLINE_EQUATION = ContentType.InlineEquation +TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation +UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + + 
+@DeprecationWarning +def mk_nlp_markdown_1(para_dict: dict): + """ + 对排序后的bboxes拼接内容 + """ + content_lst = [] + for _, page_info in para_dict.items(): + para_blocks = page_info.get("para_blocks") + if not para_blocks: + continue + + for block in para_blocks: + item = block["paras"] + for _, p in item.items(): + para_text = p["para_text"] + is_title = p["is_para_title"] + title_level = p['para_title_level'] + md_title_prefix = "#"*title_level + if is_title: + content_lst.append(f"{md_title_prefix} {para_text}") + else: + content_lst.append(para_text) + + content_text = "\n\n".join(content_lst) + + return content_text + + + +# 找到目标字符串在段落中的索引 +def __find_index(paragraph, target): + index = paragraph.find(target) + if index != -1: + return index + else: + return None + + +def __insert_string(paragraph, target, postion): + new_paragraph = paragraph[:postion] + target + paragraph[postion:] + return new_paragraph + + +def __insert_after(content, image_content, target): + """ + 在content中找到target,将image_content插入到target后面 + """ + index = content.find(target) + if index != -1: + content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):] + else: + logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}") + return content + +def __insert_before(content, image_content, target): + """ + 在content中找到target,将image_content插入到target前面 + """ + index = content.find(target) + if index != -1: + content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:] + else: + logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}") + return content + + +@DeprecationWarning +def mk_mm_markdown_1(para_dict: dict): + """拼装多模态markdown""" + content_lst = [] + for _, page_info in para_dict.items(): + page_lst = [] # 一个page内的段落列表 + para_blocks = page_info.get("para_blocks") + pymu_raw_blocks = page_info.get("preproc_blocks") + + 
all_page_images = [] + all_page_images.extend(page_info.get("images",[])) + all_page_images.extend(page_info.get("image_backup", []) ) + all_page_images.extend(page_info.get("tables",[])) + all_page_images.extend(page_info.get("table_backup",[]) ) + + if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景 + for img in all_page_images: + page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序 + page_md = "\n\n".join(page_lst) + + else: + for block in para_blocks: + item = block["paras"] + for _, p in item.items(): + para_text = p["para_text"] + is_title = p["is_para_title"] + title_level = p['para_title_level'] + md_title_prefix = "#"*title_level + if is_title: + page_lst.append(f"{md_title_prefix} {para_text}") + else: + page_lst.append(para_text) + + """拼装成一个页面的文本""" + page_md = "\n\n".join(page_lst) + """插入图片""" + for img in all_page_images: + imgbox = img['bbox'] + img_content = f"![]({img['image_path']})" + # 先看在哪个block内 + for block in pymu_raw_blocks: + bbox = block['bbox'] + if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内 + for l in block['lines']: + line_box = l['bbox'] + if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的,插入line前面 + line_txt = "".join([s['text'] for s in l['spans']]) + page_md = __insert_before(page_md, img_content, line_txt) + break + break + else:# 在行与行之间 + # 找到图片x0,y0与line的x0,y0最近的line + min_distance = 100000 + min_line = None + for l in block['lines']: + line_box = l['bbox'] + distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2) + if distance < min_distance: + min_distance = distance + min_line = l + if min_line: + line_txt = "".join([s['text'] for s in min_line['spans']]) + img_h = imgbox[3] - imgbox[1] + if min_distance 15: + words[j] = ' '.join(wordninja.split(words[j])) + segments[i] = ''.join(words) + return ' '.join(segments) + + +def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path): + 
markdown = [] + for page_info in pdf_info_list: + paras_of_layout = page_info.get("para_blocks") + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + markdown.extend(page_markdown) + return '\n\n'.join(markdown) + + +def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list): + markdown = [] + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp") + markdown.extend(page_markdown) + return '\n\n'.join(markdown) + + +def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path): + markdown_with_para_and_pagination = [] + page_no = 0 + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + markdown_with_para_and_pagination.append({ + 'page_no': page_no, + 'md_content': '\n\n'.join(page_markdown) + }) + page_no += 1 + return markdown_with_para_and_pagination + + +def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""): + page_markdown = [] + for paras in paras_of_layout: + for para in paras: + para_text = '' + for line in para: + for span in line['spans']: + span_type = span.get('type') + content = '' + language = '' + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + elif span_type == ContentType.InterlineEquation: + content = f"\n$$\n{span['content']}\n$$\n" + elif span_type in [ContentType.Image, ContentType.Table]: + if mode == 'mm': + content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n" + elif mode == 'nlp': 
+ pass + if content != '': + if language == 'en': # 英文语境下 content间需要空格分隔 + para_text += content + ' ' + else: # 中文语境下,content间不需要空格分隔 + para_text += content + if para_text.strip() == '': + continue + else: + page_markdown.append(para_text.strip() + ' ') + return page_markdown + + +def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""): + page_markdown = [] + for para_block in paras_of_layout: + para_text = '' + para_type = para_block['type'] + if para_type == BlockType.Text: + para_text = merge_para_with_text(para_block) + elif para_type == BlockType.Title: + para_text = f"# {merge_para_with_text(para_block)}" + elif para_type == BlockType.InterlineEquation: + para_text = merge_para_with_text(para_block) + elif para_type == BlockType.Image: + if mode == 'nlp': + continue + elif mode == 'mm': + for block in para_block['blocks']: # 1st.拼image_body + if block['type'] == BlockType.ImageBody: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.Image: + para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" + for block in para_block['blocks']: # 2nd.拼image_caption + if block['type'] == BlockType.ImageCaption: + para_text += merge_para_with_text(block) + elif para_type == BlockType.Table: + if mode == 'nlp': + continue + elif mode == 'mm': + for block in para_block['blocks']: # 1st.拼table_caption + if block['type'] == BlockType.TableCaption: + para_text += merge_para_with_text(block) + for block in para_block['blocks']: # 2nd.拼table_body + if block['type'] == BlockType.TableBody: + for line in block['lines']: + for span in line['spans']: + if span['type'] == ContentType.Table: + para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n" + for block in para_block['blocks']: # 3rd.拼table_footnote + if block['type'] == BlockType.TableFootnote: + para_text += merge_para_with_text(block) + + if para_text.strip() == '': + continue + else: + page_markdown.append(para_text.strip() + 
' ') + + return page_markdown + + +def merge_para_with_text(para_block): + para_text = '' + for line in para_block['lines']: + line_text = "" + line_lang = "" + for span in line['spans']: + span_type = span['type'] + if span_type == ContentType.Text: + line_text += span['content'].strip() + if line_text != "": + line_lang = detect_lang(line_text) + for span in line['spans']: + span_type = span['type'] + content = '' + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + elif span_type == ContentType.InterlineEquation: + content = f"\n$$\n{span['content']}\n$$\n" + + if content != '': + if 'zh' in line_lang: # 遇到一些一个字一个span的文档,这种单字语言判断不准,需要用整行文本判断 + para_text += content # 中文语境下,content间不需要空格分隔 + else: + para_text += content + ' ' # 英文语境下 content间需要空格分隔 + return para_text + + +def para_to_standard_format(para, img_buket_path): + para_content = {} + if len(para) == 1: + para_content = line_to_standard_format(para[0], img_buket_path) + elif len(para) > 1: + para_text = '' + inline_equation_num = 0 + for line in para: + for span in line['spans']: + language = '' + span_type = span.get('type') + content = "" + if span_type == ContentType.Text: + content = span['content'] + language = detect_lang(content) + if language == 'en': # 只对英文长词进行分词处理,中文分词会丢失文本 + content = ocr_escape_special_markdown_char(split_long_words(content)) + else: + content = ocr_escape_special_markdown_char(content) + elif span_type == ContentType.InlineEquation: + content = f"${span['content']}$" + inline_equation_num += 1 + + if language == 'en': # 英文语境下 content间需要空格分隔 + para_text += content + ' ' + else: # 中文语境下,content间不需要空格分隔 + para_text += content + para_content = { + 'type': 'text', + 'text': 
para_text, + 'inline_equation_num': inline_equation_num + } + return para_content + + +def para_to_standard_format_v2(para_block, img_buket_path): + para_type = para_block['type'] + if para_type == BlockType.Text: + para_content = { + 'type': 'text', + 'text': merge_para_with_text(para_block), + } + elif para_type == BlockType.Title: + para_content = { + 'type': 'text', + 'text': merge_para_with_text(para_block), + 'text_level': 1 + } + elif para_type == BlockType.InterlineEquation: + para_content = { + 'type': 'equation', + 'text': merge_para_with_text(para_block), + 'text_format': "latex" + } + elif para_type == BlockType.Image: + para_content = { + 'type': 'image', + } + for block in para_block['blocks']: + if block['type'] == BlockType.ImageBody: + para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) + if block['type'] == BlockType.ImageCaption: + para_content['img_caption'] = merge_para_with_text(block) + elif para_type == BlockType.Table: + para_content = { + 'type': 'table', + } + for block in para_block['blocks']: + if block['type'] == BlockType.TableBody: + para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path']) + if block['type'] == BlockType.TableCaption: + para_content['table_caption'] = merge_para_with_text(block) + if block['type'] == BlockType.TableFootnote: + para_content['table_footnote'] = merge_para_with_text(block) + + return para_content + + +def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str): + content_list = [] + for page_info in pdf_info_dict: + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + for para_block in paras_of_layout: + para_content = para_to_standard_format_v2(para_block, img_buket_path) + content_list.append(para_content) + return content_list + + +def line_to_standard_format(line, img_buket_path): + line_text = "" + inline_equation_num = 0 + for span in line['spans']: + if not 
span.get('content'): + if not span.get('image_path'): + continue + else: + if span['type'] == ContentType.Image: + content = { + 'type': 'image', + 'img_path': join_path(img_buket_path, span['image_path']) + } + return content + elif span['type'] == ContentType.Table: + content = { + 'type': 'table', + 'img_path': join_path(img_buket_path, span['image_path']) + } + return content + else: + if span['type'] == ContentType.InterlineEquation: + interline_equation = span['content'] + content = { + 'type': 'equation', + 'latex': f"$$\n{interline_equation}\n$$" + } + return content + elif span['type'] == ContentType.InlineEquation: + inline_equation = span['content'] + line_text += f"${inline_equation}$" + inline_equation_num += 1 + elif span['type'] == ContentType.Text: + text_content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号 + line_text += text_content + content = { + 'type': 'text', + 'text': line_text, + 'inline_equation_num': inline_equation_num + } + return content + + +def ocr_mk_mm_standard_format(pdf_info_dict: list): + """ + content_list + type string image/text/table/equation(行间的单独拿出来,行内的和text合并) + latex string latex文本字段。 + text string 纯文本格式的文本数据。 + md string markdown格式的文本数据。 + img_path string s3://full/path/to/img.jpg + """ + content_list = [] + for page_info in pdf_info_dict: + blocks = page_info.get("preproc_blocks") + if not blocks: + continue + for block in blocks: + for line in block['lines']: + content = line_to_standard_format(line) + content_list.append(content) + return content_list + + +def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""): + output_content = [] + for page_info in pdf_info_dict: + if page_info.get("need_drop", False): + drop_reason = page_info.get("drop_reason") + if drop_mode == DropMode.NONE: + pass + elif drop_mode == DropMode.WHOLE_PDF: + raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}") + elif drop_mode == DropMode.SINGLE_PAGE: + 
logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}") + continue + else: + raise Exception(f"drop_mode can not be null") + + paras_of_layout = page_info.get("para_blocks") + if not paras_of_layout: + continue + if make_mode == MakeMode.MM_MD: + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path) + output_content.extend(page_markdown) + elif make_mode == MakeMode.NLP_MD: + page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp") + output_content.extend(page_markdown) + elif make_mode == MakeMode.STANDARD_FORMAT: + for para_block in paras_of_layout: + para_content = para_to_standard_format_v2(para_block, img_buket_path) + output_content.append(para_content) + if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]: + return '\n\n'.join(output_content) + elif make_mode == MakeMode.STANDARD_FORMAT: + return output_content diff --git a/magic_pdf/filter/__init__.py b/magic_pdf/filter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/filter/pdf_classify_by_type.py b/magic_pdf/filter/pdf_classify_by_type.py new file mode 100644 index 0000000000000000000000000000000000000000..319fb3fb20273187507fa66a10cdc0d61e99d500 --- /dev/null +++ b/magic_pdf/filter/pdf_classify_by_type.py @@ -0,0 +1,393 @@ +""" +根据利用meta_scan得到的结果,对pdf是否为文字版进行分类。 +定义标准: +一、什么pdf会是文字pdf,只要满足以下任意一条 + 1. 随机抽取N页,如果有任何一页文字数目大于100 + 2. 只要存在一个页面,图片的数量为0 +二、什么是扫描版pdf,只要满足以下任意一条 + 1. ~~80%页面上的最大图大小一样并且面积超过页面面积0.6~~ + 2. 
大部分页面上文字的长度都是相等的。 + +""" +import json +import sys +from collections import Counter + +import click +import numpy as np +from loguru import logger + +from magic_pdf.libs.commons import mymax, get_top_percent_list +from magic_pdf.filter.pdf_meta_scan import scan_max_page, junk_limit_min + +TEXT_LEN_THRESHOLD = 100 +AVG_TEXT_LEN_THRESHOLD = 100 +TEXT_LEN_SAMPLE_RATIO = 0.1 # 抽取0.1的页面进行文字长度统计 + + +# 一个拼接图片的方案,将某些特殊扫描版本的拆图拼成一张整图 +def merge_images(image_list, page_width, page_height, max_offset=5, max_gap=2): + # 先通过set去除所有bbox重叠的图片数据 + image_list_result = [] + for page_images in image_list: + page_result = [] + dedup = set() + for img in page_images: + x0, y0, x1, y1, img_bojid = img + if (x0, y0, x1, y1) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 + continue + else: + dedup.add((x0, y0, x1, y1)) + page_result.append([x0, y0, x1, y1, img_bojid]) + image_list_result.append(page_result) + + # 接下来,将同一页可拼接的图片进行合并 + merged_images = [] + for page_images in image_list_result: + if not page_images: + continue + + # 先将同一页的图片从上到下,从左到右进行排序 + page_images.sort(key=lambda img: (img[1], img[0])) + + merged = [page_images[0]] + + for img in page_images[1:]: + x0, y0, x1, y1, imgid = img + + last_img = merged[-1] + last_x0, last_y0, last_x1, last_y1, last_imgid = last_img + + # 单张图片宽或者高覆盖页面宽高的9成以上是拼图的一个前置条件 + full_width = abs(x1 - x0) >= page_width * 0.9 + full_height = abs(y1 - y0) >= page_height * 0.9 + + # 如果宽达标,检测是否能竖着拼 + if full_width: + # 竖着拼需要满足两个前提,左右边界各偏移不能超过 max_offset,第一张图的下边界和第二张图的上边界偏移不能超过 max_gap + close1 = (last_x0 - max_offset) <= x0 <= (last_x0 + max_offset) and (last_x1 - max_offset) <= x1 <= ( + last_x1 + max_offset) and (last_y1 - max_gap) <= y0 <= (last_y1 + max_gap) + + # 如果高达标,检测是否可以横着拼 + if full_height: + # 横着拼需要满足两个前提,上下边界各偏移不能超过 max_offset,第一张图的右边界和第二张图的左边界偏移不能超过 max_gap + close2 = (last_y0 - max_offset) <= y0 <= (last_y0 + max_offset) and (last_y1 - max_offset) <= y1 <= ( + last_y1 + max_offset) and (last_x1 - max_gap) <= x0 <= (last_x1 + max_gap) + + # Check if 
the image can be merged with the last image + if (full_width and close1) or (full_height and close2): + # Merge the image with the last image + merged[-1] = [min(x0, last_x0), min(y0, last_y0), + max(x1, last_x1), max(y1, last_y1), imgid] + else: + # Add the image as a new image + merged.append(img) + + merged_images.append(merged) + + return merged_images + + +def classify_by_area(total_page: int, page_width, page_height, img_sz_list, text_len_list: list): + """ + 80%页面上的最大图大小一样并且面积超过页面面积0.6则返回False,否则返回True + :param pdf_path: + :param total_page: + :param page_width: + :param page_height: + :param img_sz_list: + :return: + """ + # # 只要有一页没有图片,那么就是文字pdf。但是同时还需要满足一个条件就是这个页面上同时不能有文字。发现过一些扫描版pdf,上面有一些空白页面,既没有图片也没有文字。 + # if any([len(img_sz) == 0 for img_sz in img_sz_list]): # 含有不含图片的页面 + # # 现在找到这些页面的index + # empty_page_index = [i for i, img_sz in enumerate(img_sz_list) if len(img_sz) == 0] + # # 然后检查这些页面上是否有文字 + # text_len_at_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in empty_page_index and text_len > 0] + # if len(text_len_at_page_idx) > TEXT_LEN_THRESHOLD: # 没有图片,但是有文字,说明可能是个文字版,如果没有文字则无法判断,留给下一步,现在要求这页文字量超过一定阈值 + # return True + + # 通过objid去掉重复出现10次以上的图片,这些图片是隐藏的透明图层,其特点是id都一样 + # 先对每个id出现的次数做个统计 + objid_cnt = Counter([objid for page_img_sz in img_sz_list for _, _, _, _, objid in page_img_sz]) + # 再去掉出现次数大于10的 + if total_page >= scan_max_page: # 新的meta_scan只扫描前 scan_max_page 页,页数大于 scan_max_page 当total_page为 scan_max_page + total_page = scan_max_page + + repeat_threshold = 2 # 把bad_image的阈值设为2 + # repeat_threshold = min(2, total_page) # 当total_page为1时,repeat_threshold为1,会产生误判导致所有img变成bad_img + bad_image_objid = set([objid for objid, cnt in objid_cnt.items() if cnt >= repeat_threshold]) + # bad_image_page_idx = [i for i, page_img_sz in enumerate(img_sz_list) if any([objid in bad_image_objid for _, _, _, _, objid in page_img_sz])] + # text_len_at_bad_image_page_idx = [text_len for i, text_len in enumerate(text_len_list) if i in 
bad_image_page_idx and text_len > 0] + + # 特殊情况,一个文字版pdf,每页覆盖一个超大的透明图片,超大的定义是图片占整页面积的90%以上 + # fake_image_ids = [objid for objid in bad_image_objid if + # any([abs((x1 - x0) * (y1 - y0) / page_width * page_height) > 0.9 for images in img_sz_list for + # x0, y0, x1, y1, _ in images])] # 原来的代码,any里面恒为true了,原因??? + # fake_image_ids = [objid for objid in bad_image_objid for images in img_sz_list for x0, y0, x1, y1, img_id in images + # if img_id == objid and abs((x1 - x0) * (y1 - y0)) / (page_width * page_height) > 0.9] + + # if len(fake_image_ids) > 0 and any([l > TEXT_LEN_THRESHOLD for l in text_len_at_bad_image_page_idx]): # 这些透明图片所在的页面上有文字大于阈值 + # return True + + img_sz_list = [[img_sz for img_sz in page_img_sz if img_sz[-1] not in bad_image_objid] for page_img_sz in + img_sz_list] # 过滤掉重复出现的图片 + + # 有的扫描版会把一页图片拆成很多张,需要先把图拼起来再计算 + img_sz_list = merge_images(img_sz_list, page_width, page_height) + + # 计算每个页面上最大的图的面积,然后计算这个面积占页面面积的比例 + max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in + img_sz_list] + page_area = page_width * page_height + max_image_area_per_page = [area / page_area for area in max_image_area_per_page] + max_image_area_per_page = [area for area in max_image_area_per_page if area > 0.5] + + if len(max_image_area_per_page) >= 0.5 * total_page: # 阈值从0.8改到0.5,适配3页里面有两页和两页里面有一页的情况 + # 这里条件成立的前提是把反复出现的图片去掉了。这些图片是隐藏的透明图层,其特点是id都一样 + return False + else: + return True + + +def classify_by_text_len(text_len_list: list, total_page: int): + """ + 随机抽取10%的页面,如果少于5个页面,那么就取全部页面。 + 查看页面上的文字长度,如果有任何一个页面的文字长度大于TEXT_LEN_THRESHOLD,那么就是文字pdf + :param total_page: + :param text_len_list: + :return: + """ + select_page_cnt = int(total_page * TEXT_LEN_SAMPLE_RATIO) # 选取10%的页面 + if select_page_cnt < 5: + select_page_cnt = total_page + + # # 排除头尾各10页 + # if total_page > 20: # 如果总页数大于20 + # page_range = list(range(10, total_page - 10)) # 从第11页到倒数第11页 + # else: + # page_range = list(range(total_page)) # 否则选择所有页面 + 
# page_num = np.random.choice(page_range, min(select_page_cnt, len(page_range)), replace=False) + # 排除前后10页对只有21,22页的pdf很尴尬,如果选出来的中间那一两页恰好没字容易误判,有了avg_words规则,这个规则可以忽略 + page_num = np.random.choice(total_page, select_page_cnt, replace=False) + text_len_lst = [text_len_list[i] for i in page_num] + is_text_pdf = any([text_len > TEXT_LEN_THRESHOLD for text_len in text_len_lst]) + return is_text_pdf + + +def classify_by_avg_words(text_len_list: list): + """ + 补充规则,如果平均每页字数少于 AVG_TEXT_LEN_THRESHOLD,就不是文字pdf + 主要是各种图集 + :param text_len_list: + :return: + """ + sum_words = sum(text_len_list) + count_of_numbers = len(text_len_list) + if count_of_numbers == 0: + is_text_pdf = False + else: + avg_words = round(sum_words / count_of_numbers) + if avg_words > AVG_TEXT_LEN_THRESHOLD: + is_text_pdf = True + else: + is_text_pdf = False + + return is_text_pdf + + +def classify_by_img_num(img_sz_list: list, img_num_list: list): + """ + 补充规则,有一种扫描版本的PDF,每一页都会放所有的扫描页进去,在 metascan 时会被去重, + 这种pdf的 metasca 扫描结果的特点是 img_sz_list 内全是空元素,img_num_list中每一页的数量都很大且相同 + :param img_sz_list: + :param img_num_list: + :return: + """ + # 计算img_sz_list中非空元素的个数 + count_img_sz_list_not_none = sum(1 for item in img_sz_list if item) + # 获取前80%的元素 + top_eighty_percent = get_top_percent_list(img_num_list, 0.8) + # img_sz_list中非空元素的个数小于1,前80%的元素都相等,且最大值大于等于junk_limit_min + if count_img_sz_list_not_none <= 1 and len(set(top_eighty_percent)) == 1 and max(img_num_list) >= junk_limit_min: + + #拿max和min的值,用来判断list内的值是否全都相等 + # min_imgs = min(img_num_list) + # max_imgs = max(img_num_list) + # + # if count_img_sz_list_not_none == 0 and max_imgs == min_imgs and max_imgs >= junk_limit_min: + return False # 如果满足这个条件,一定不是文字版pdf + else: + return True # 不满足这三个条件,可能是文字版pdf,通过其他规则判断 + + +def classify_by_text_layout(text_layout_per_page: list): + """ + 判断文本布局是否以竖排为主。 + + Args: + text_layout_per_page (list): 文本布局列表,列表中的每个元素表示一页的文本布局, + 值为'vertical'表示竖排,值为'horizontal'表示横排。 + + Returns: + bool: 若文本布局以竖排为主,则返回False;否则返回True。 + 
""" + # 统计text_layout_per_page中竖排的个数 + count_vertical = sum(1 for item in text_layout_per_page if item == 'vertical') + # 统计text_layout_per_page中横排的个数 + count_horizontal = sum(1 for item in text_layout_per_page if item == 'horizontal') + # 计算text_layout_per_page中竖排的占比 + known_layout_cnt = count_vertical + count_horizontal + if known_layout_cnt != 0: + ratio = count_vertical / known_layout_cnt + if ratio >= 0.5: # 阈值设为0.5,适配3页里面有2页和两页里有一页的情况 + return False # 文本布局以竖排为主,认为不是文字版pdf + else: + return True # 文本布局以横排为主,认为是文字版pdf + else: + return False # 文本布局未知,默认认为不是文字版pdf + + +def classify_by_img_narrow_strips(page_width, page_height, img_sz_list): + """ + 判断一页是否由细长条组成,有两个条件: + 1. 图片的宽或高达到页面宽或高的90%,且长边需要是窄边长度的数倍以上 + 2. 整个页面所有的图片有80%以上满足条件1 + + Args: + page_width (float): 页面宽度 + page_height (float): 页面高度 + img_sz_list (list): 图片尺寸列表,每个元素为一个元组,表示图片的矩形区域和尺寸,形如(x0, y0, x1, y1, size),其中(x0, y0)为矩形区域的左上角坐标,(x1, y1)为矩形区域的右下角坐标,size为图片的尺寸 + + Returns: + bool: 如果满足条件的页面的比例小于0.5,返回True,否则返回False + """ + + def is_narrow_strip(img): + x0, y0, x1, y1, _ = img + width, height = x1 - x0, y1 - y0 + return any([ + # 图片宽度大于等于页面宽度的90%,且宽度大于等于高度4倍 + width >= page_width * 0.9 and width >= height * 4, + # 图片高度大于等于页面高度的90%,且高度大于等于宽度4倍 + height >= page_height * 0.9 and height >= width * 4, + ]) + + # 初始化满足条件的页面数量 + narrow_strip_pages_count = 0 + + # 遍历所有页面 + for page_img_list in img_sz_list: + # 忽略空页面 + if not page_img_list: + continue + + # 计算页面中的图片总数 + total_images = len(page_img_list) + + # 计算页面中细长条图片的数量 + narrow_strip_images_count = 0 + for img in page_img_list: + if is_narrow_strip(img): + narrow_strip_images_count += 1 + # 如果细长条图片的数量少于5,跳过 + if narrow_strip_images_count < 5: + continue + else: + # 如果细长条图片的比例大于或等于0.8,增加满足条件的页面数量 + if narrow_strip_images_count / total_images >= 0.8: + narrow_strip_pages_count += 1 + + # 计算满足条件的页面的比例 + narrow_strip_pages_ratio = narrow_strip_pages_count / len(img_sz_list) + + return narrow_strip_pages_ratio < 0.5 + + +def classify(total_page: int, page_width, 
page_height, img_sz_list: list, text_len_list: list, img_num_list: list, + text_layout_list: list, invalid_chars: bool): + """ + 这里的图片和页面长度单位是pts + :param total_page: + :param text_len_list: + :param page_width: + :param page_height: + :param img_sz_list: + :param pdf_path: + :return: + """ + results = { + 'by_image_area': classify_by_area(total_page, page_width, page_height, img_sz_list, text_len_list), + 'by_text_len': classify_by_text_len(text_len_list, total_page), + 'by_avg_words': classify_by_avg_words(text_len_list), + 'by_img_num': classify_by_img_num(img_sz_list, img_num_list), + 'by_text_layout': classify_by_text_layout(text_layout_list), + 'by_img_narrow_strips': classify_by_img_narrow_strips(page_width, page_height, img_sz_list), + 'by_invalid_chars': invalid_chars, + } + + if all(results.values()): + return True, results + elif not any(results.values()): + return False, results + else: + logger.warning( + f"pdf is not classified by area and text_len, by_image_area: {results['by_image_area']}," + f" by_text: {results['by_text_len']}, by_avg_words: {results['by_avg_words']}, by_img_num: {results['by_img_num']}," + f" by_text_layout: {results['by_text_layout']}, by_img_narrow_strips: {results['by_img_narrow_strips']}," + f" by_invalid_chars: {results['by_invalid_chars']}", + file=sys.stderr) # 利用这种情况可以快速找出来哪些pdf比较特殊,针对性修正分类算法 + return False, results + + +@click.command() +@click.option("--json-file", type=str, help="pdf信息") +def main(json_file): + if json_file is None: + print("json_file is None", file=sys.stderr) + exit(0) + try: + with open(json_file, "r") as f: + for l in f: + if l.strip() == "": + continue + o = json.loads(l) + total_page = o["total_page"] + page_width = o["page_width_pts"] + page_height = o["page_height_pts"] + img_sz_list = o["image_info_per_page"] + text_len_list = o['text_len_per_page'] + text_layout_list = o['text_layout_per_page'] + pdf_path = o['pdf_path'] + is_encrypted = o['is_encrypted'] + is_needs_password = 
o['is_needs_password'] + if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + continue + tag = classify(total_page, page_width, page_height, img_sz_list, text_len_list, text_layout_list) + o['is_text_pdf'] = tag + print(json.dumps(o, ensure_ascii=False)) + except Exception as e: + print("ERROR: ", e, file=sys.stderr) + + +if __name__ == "__main__": + main() + # false = False + # true = True + # null = None + # o = {"pdf_path":"s3://llm-raw-snew/llm-raw-the-eye/raw/World%20Tracker%20Library/worldtracker.org/media/library/Science/Computer%20Science/Shreiner%20-%20OpenGL%20Programming%20Guide%206e%20%5BThe%20Redbook%5D%20%28AW%2C%202008%29.pdf","is_needs_password":false,"is_encrypted":false,"total_page":978,"page_width_pts":368,"page_height_pts":513,"image_info_per_page":[[[0,0,368,513,10037]],[[0,0,368,513,4]],[[0,0,368,513,7]],[[0,0,368,513,10]],[[0,0,368,513,13]],[[0,0,368,513,16]],[[0,0,368,513,19]],[[0,0,368,513,22]],[[0,0,368,513,25]],[[0,0,368,513,28]],[[0,0,368,513,31]],[[0,0,368,513,34]],[[0,0,368,513,37]],[[0,0,368,513,40]],[[0,0,368,513,43]],[[0,0,368,513,46]],[[0,0,368,513,49]],[[0,0,368,513,52]],[[0,0,368,513,55]],[[0,0,368,513,58]],[[0,0,368,513,61]],[[0,0,368,513,64]],[[0,0,368,513,67]],[[0,0,368,513,70]],[[0,0,368,513,73]],[[0,0,368,516,76]],[[0,0,368,516,79]],[[0,0,368,513,82]],[[0,0,368,513,85]],[[0,0,368,513,88]],[[0,0,368,513,91]],[[0,0,368,513,94]],[[0,0,368,513,97]],[[0,0,368,513,100]],[[0,0,368,513,103]],[[0,0,368,513,106]],[[0,0,368,513,109]],[[0,0,368,513,112]],[[0,0,368,513,115]],[[0,0,368,513,118]],[[0,0,368,513,121]],[[0,0,368,513,124]],[[0,0,368,513,127]],[[0,0,368,513,130]],[[0,0,368,513,133]],[[0,0,368,513,136]],[[0,0,368,513,139]],[[0,0,368,513,142]],[[0,0,368,513,145]],[[0,0,368,513,148]],[[0,0,368,513,151]],[[0,0,368,513,154]],[[0,0,368,513,157]],[[0,0,368,513,160]],[[0,0,368,513,163]],[[0,0,368,513,166]],[[0,0,368,513,169]],[[0,0,368,513,172]],[[0,0,368,513,175]],[[0,0,368,513,178]],[[0,0,368,513,181]],[[0
,0,368,513,184]],[[0,0,368,513,187]],[[0,0,368,513,190]],[[0,0,368,513,193]],[[0,0,368,513,196]],[[0,0,368,513,199]],[[0,0,368,513,202]],[[0,0,368,513,205]],[[0,0,368,513,208]],[[0,0,368,513,211]],[[0,0,368,513,214]],[[0,0,368,513,217]],[[0,0,368,513,220]],[[0,0,368,513,223]],[[0,0,368,513,226]],[[0,0,368,513,229]],[[0,0,368,513,232]],[[0,0,368,513,235]],[[0,0,368,513,238]],[[0,0,368,513,241]],[[0,0,368,513,244]],[[0,0,368,513,247]],[[0,0,368,513,250]],[[0,0,368,513,253]],[[0,0,368,513,256]],[[0,0,368,513,259]],[[0,0,368,513,262]],[[0,0,368,513,265]],[[0,0,368,513,268]],[[0,0,368,513,271]],[[0,0,368,513,274]],[[0,0,368,513,277]],[[0,0,368,513,280]],[[0,0,368,513,283]],[[0,0,368,513,286]],[[0,0,368,513,289]],[[0,0,368,513,292]],[[0,0,368,513,295]],[[0,0,368,513,298]],[[0,0,368,513,301]],[[0,0,368,513,304]],[[0,0,368,513,307]],[[0,0,368,513,310]],[[0,0,368,513,313]],[[0,0,368,513,316]],[[0,0,368,513,319]],[[0,0,368,513,322]],[[0,0,368,513,325]],[[0,0,368,513,328]],[[0,0,368,513,331]],[[0,0,368,513,334]],[[0,0,368,513,337]],[[0,0,368,513,340]],[[0,0,368,513,343]],[[0,0,368,513,346]],[[0,0,368,513,349]],[[0,0,368,513,352]],[[0,0,368,513,355]],[[0,0,368,513,358]],[[0,0,368,513,361]],[[0,0,368,513,364]],[[0,0,368,513,367]],[[0,0,368,513,370]],[[0,0,368,513,373]],[[0,0,368,513,376]],[[0,0,368,513,379]],[[0,0,368,513,382]],[[0,0,368,513,385]],[[0,0,368,513,388]],[[0,0,368,513,391]],[[0,0,368,513,394]],[[0,0,368,513,397]],[[0,0,368,513,400]],[[0,0,368,513,403]],[[0,0,368,513,406]],[[0,0,368,513,409]],[[0,0,368,513,412]],[[0,0,368,513,415]],[[0,0,368,513,418]],[[0,0,368,513,421]],[[0,0,368,513,424]],[[0,0,368,513,427]],[[0,0,368,513,430]],[[0,0,368,513,433]],[[0,0,368,513,436]],[[0,0,368,513,439]],[[0,0,368,513,442]],[[0,0,368,513,445]],[[0,0,368,513,448]],[[0,0,368,513,451]],[[0,0,368,513,454]],[[0,0,368,513,457]],[[0,0,368,513,460]],[[0,0,368,513,463]],[[0,0,368,513,466]],[[0,0,368,513,469]],[[0,0,368,513,472]],[[0,0,368,513,475]],[[0,0,368,513,478]],[[0,0,368,513,481]],[[0
,0,368,513,484]],[[0,0,368,513,487]],[[0,0,368,513,490]],[[0,0,368,513,493]],[[0,0,368,513,496]],[[0,0,368,513,499]],[[0,0,368,513,502]],[[0,0,368,513,505]],[[0,0,368,513,508]],[[0,0,368,513,511]],[[0,0,368,513,514]],[[0,0,368,513,517]],[[0,0,368,513,520]],[[0,0,368,513,523]],[[0,0,368,513,526]],[[0,0,368,513,529]],[[0,0,368,513,532]],[[0,0,368,513,535]],[[0,0,368,513,538]],[[0,0,368,513,541]],[[0,0,368,513,544]],[[0,0,368,513,547]],[[0,0,368,513,550]],[[0,0,368,513,553]],[[0,0,368,513,556]],[[0,0,368,513,559]],[[0,0,368,513,562]],[[0,0,368,513,565]],[[0,0,368,513,568]],[[0,0,368,513,571]],[[0,0,368,513,574]],[[0,0,368,513,577]],[[0,0,368,513,580]],[[0,0,368,513,583]],[[0,0,368,513,586]],[[0,0,368,513,589]],[[0,0,368,513,592]],[[0,0,368,513,595]],[[0,0,368,513,598]],[[0,0,368,513,601]],[[0,0,368,513,604]],[[0,0,368,513,607]],[[0,0,368,513,610]],[[0,0,368,513,613]],[[0,0,368,513,616]],[[0,0,368,513,619]],[[0,0,368,513,622]],[[0,0,368,513,625]],[[0,0,368,513,628]],[[0,0,368,513,631]],[[0,0,368,513,634]],[[0,0,368,513,637]],[[0,0,368,513,640]],[[0,0,368,513,643]],[[0,0,368,513,646]],[[0,0,368,513,649]],[[0,0,368,513,652]],[[0,0,368,513,655]],[[0,0,368,513,658]],[[0,0,368,513,661]],[[0,0,368,513,664]],[[0,0,368,513,667]],[[0,0,368,513,670]],[[0,0,368,513,673]],[[0,0,368,513,676]],[[0,0,368,513,679]],[[0,0,368,513,682]],[[0,0,368,513,685]],[[0,0,368,513,688]],[[0,0,368,513,691]],[[0,0,368,513,694]],[[0,0,368,513,697]],[[0,0,368,513,700]],[[0,0,368,513,703]],[[0,0,368,513,706]],[[0,0,368,513,709]],[[0,0,368,513,712]],[[0,0,368,513,715]],[[0,0,368,513,718]],[[0,0,368,513,721]],[[0,0,368,513,724]],[[0,0,368,513,727]],[[0,0,368,513,730]],[[0,0,368,513,733]],[[0,0,368,513,736]],[[0,0,368,513,739]],[[0,0,368,513,742]],[[0,0,368,513,745]],[[0,0,368,513,748]],[[0,0,368,513,751]],[[0,0,368,513,754]],[[0,0,368,513,757]],[[0,0,368,513,760]],[[0,0,368,513,763]],[[0,0,368,513,766]],[[0,0,368,513,769]],[[0,0,368,513,772]],[[0,0,368,513,775]],[[0,0,368,513,778]],[[0,0,368,513,781]],[[0
,0,368,513,784]],[[0,0,368,513,787]],[[0,0,368,513,790]],[[0,0,368,513,793]],[[0,0,368,513,796]],[[0,0,368,513,799]],[[0,0,368,513,802]],[[0,0,368,513,805]],[[0,0,368,513,808]],[[0,0,368,513,811]],[[0,0,368,513,814]],[[0,0,368,513,817]],[[0,0,368,513,820]],[[0,0,368,513,823]],[[0,0,368,513,826]],[[0,0,368,513,829]],[[0,0,368,513,832]],[[0,0,368,513,835]],[[0,0,368,513,838]],[[0,0,368,513,841]],[[0,0,368,513,844]],[[0,0,368,513,847]],[[0,0,368,513,850]],[[0,0,368,513,853]],[[0,0,368,513,856]],[[0,0,368,513,859]],[[0,0,368,513,862]],[[0,0,368,513,865]],[[0,0,368,513,868]],[[0,0,368,513,871]],[[0,0,368,513,874]],[[0,0,368,513,877]],[[0,0,368,513,880]],[[0,0,368,513,883]],[[0,0,368,513,886]],[[0,0,368,513,889]],[[0,0,368,513,892]],[[0,0,368,513,895]],[[0,0,368,513,898]],[[0,0,368,513,901]],[[0,0,368,513,904]],[[0,0,368,513,907]],[[0,0,368,513,910]],[[0,0,368,513,913]],[[0,0,368,513,916]],[[0,0,368,513,919]],[[0,0,368,513,922]],[[0,0,368,513,925]],[[0,0,368,513,928]],[[0,0,368,513,931]],[[0,0,368,513,934]],[[0,0,368,513,937]],[[0,0,368,513,940]],[[0,0,368,513,943]],[[0,0,368,513,946]],[[0,0,368,513,949]],[[0,0,368,513,952]],[[0,0,368,513,955]],[[0,0,368,513,958]],[[0,0,368,513,961]],[[0,0,368,513,964]],[[0,0,368,513,967]],[[0,0,368,513,970]],[[0,0,368,513,973]],[[0,0,368,513,976]],[[0,0,368,513,979]],[[0,0,368,513,982]],[[0,0,368,513,985]],[[0,0,368,513,988]],[[0,0,368,513,991]],[[0,0,368,513,994]],[[0,0,368,513,997]],[[0,0,368,513,1000]],[[0,0,368,513,1003]],[[0,0,368,513,1006]],[[0,0,368,513,1009]],[[0,0,368,513,1012]],[[0,0,368,513,1015]],[[0,0,368,513,1018]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,281
6]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,2797]],[[0,0,368,513,2798]],[[0,0,368,513,2799]],[[0,0,368,513,2800]],[[0,0,368,513,2801]],[[0,0,368,513,2802]],[[0,0,368,513,2803]],[[0,0,368,513,2804]],[[0,0,368,513,2805]],[[0,0,368,513,2806]],[[0,0,368,513,2807]],[[0,0,368,513,2808]],[[0,0,368,513,2809]],[[0,0,368,513,2810]],[[0,0,368,513,2811]],[[0,0,368,513,2812]],[[0,0,368,513,2813]],[[0,0,368,513,2814]],[[0,0,368,513,2815]],[[0,0,368,513,2816]],[[0,0,368,513,2817]],[[0,0,368,513,2818]],[[0,0,368,513,2819]],[[0,0,368,513,2820]],[[0,0,368,513,2821]],[[0,0,368,513,2822]],[[0,0,368,513,2823]],[[0,0,368,513,2824]],[[0,0,368,513,2825]],[[0,0,368,513,2826]],[[0,0,368,513,2827]],[[0,0,368,513,2828]],[[0,0,368,513,2829]],[[0,0,368,513,2830]],[[0,0,368,513,2831]],[[0,0,368,513,2832]],[[0,0,368,513,2833]],[[0,0,368,513,2834]],[[0,0,368,513,2835]],[[0,0,368,513,2836]],[[0,0,368,513,2837]],[[0,0,368,513,2838]],[[0,0,368,513,2839]],[[0,0,368,513,2840]],[[0,0,368,513,2841]],[[0,0,368,513,2842]],[[0,0,368,513,2843]],[
[0,0,368,513,2844]],[[0,0,368,513,2845]],[[0,0,368,513,2846]],[[0,0,368,513,2847]],[[0,0,368,513,2848]],[[0,0,368,513,2849]],[[0,0,368,513,2850]],[[0,0,368,513,2851]],[[0,0,368,513,2852]],[[0,0,368,513,2853]],[[0,0,368,513,2854]],[[0,0,368,513,2855]],[[0,0,368,513,2856]],[[0,0,368,513,2857]],[[0,0,368,513,2858]],[[0,0,368,513,2859]],[[0,0,368,513,2860]],[[0,0,368,513,2861]],[[0,0,368,513,2862]],[[0,0,368,513,2863]],[[0,0,368,513,2864]],[[0,0,368,513,1293]],[[0,0,368,513,1296]],[[0,0,368,513,1299]],[[0,0,368,513,1302]],[[0,0,368,513,1305]],[[0,0,368,513,1308]],[[0,0,368,513,1311]],[[0,0,368,513,1314]],[[0,0,368,513,1317]],[[0,0,368,513,1320]],[[0,0,368,513,1323]],[[0,0,368,513,1326]],[[0,0,368,513,1329]],[[0,0,368,513,1332]],[[0,0,368,513,1335]],[[0,0,368,513,1338]],[[0,0,368,513,1341]],[[0,0,368,513,1344]],[[0,0,368,513,1347]],[[0,0,368,513,1350]],[[0,0,368,513,1353]],[[0,0,368,513,1356]],[[0,0,368,513,1359]],[[0,0,368,513,1362]],[[0,0,368,513,1365]],[[0,0,368,513,1368]],[[0,0,368,513,1371]],[[0,0,368,513,1374]],[[0,0,368,513,1377]],[[0,0,368,513,1380]],[[0,0,368,513,1383]],[[0,0,368,513,1386]],[[0,0,368,513,1389]],[[0,0,368,513,1392]],[[0,0,368,513,1395]],[[0,0,368,513,1398]],[[0,0,368,513,1401]],[[0,0,368,513,1404]],[[0,0,368,513,1407]],[[0,0,368,513,1410]],[[0,0,368,513,1413]],[[0,0,368,513,1416]],[[0,0,368,513,1419]],[[0,0,368,513,1422]],[[0,0,368,513,1425]],[[0,0,368,513,1428]],[[0,0,368,513,1431]],[[0,0,368,513,1434]],[[0,0,368,513,1437]],[[0,0,368,513,1440]],[[0,0,368,513,1443]],[[0,0,368,513,1446]],[[0,0,368,513,1449]],[[0,0,368,513,1452]],[[0,0,368,513,1455]],[[0,0,368,513,1458]],[[0,0,368,513,1461]],[[0,0,368,513,1464]],[[0,0,368,513,1467]],[[0,0,368,513,1470]],[[0,0,368,513,1473]],[[0,0,368,513,1476]],[[0,0,368,513,1479]],[[0,0,368,513,1482]],[[0,0,368,513,1485]],[[0,0,368,513,1488]],[[0,0,368,513,1491]],[[0,0,368,513,1494]],[[0,0,368,513,1497]],[[0,0,368,513,1500]],[[0,0,368,513,1503]],[[0,0,368,513,1506]],[[0,0,368,513,1509]],[[0,0,368,513,1512]],[[0,0,
368,513,1515]],[[0,0,368,513,1518]],[[0,0,368,513,1521]],[[0,0,368,513,1524]],[[0,0,368,513,1527]],[[0,0,368,513,1530]],[[0,0,368,513,1533]],[[0,0,368,513,1536]],[[0,0,368,513,1539]],[[0,0,368,513,1542]],[[0,0,368,513,1545]],[[0,0,368,513,1548]],[[0,0,368,513,1551]],[[0,0,368,513,1554]],[[0,0,368,513,1557]],[[0,0,368,513,1560]],[[0,0,368,513,1563]],[[0,0,368,513,1566]],[[0,0,368,513,1569]],[[0,0,368,513,1572]],[[0,0,368,513,1575]],[[0,0,368,513,1578]],[[0,0,368,513,1581]],[[0,0,368,513,1584]],[[0,0,368,513,1587]],[[0,0,368,513,1590]],[[0,0,368,513,1593]],[[0,0,368,513,1596]],[[0,0,368,513,1599]],[[0,0,368,513,1602]],[[0,0,368,513,1605]],[[0,0,368,513,1608]],[[0,0,368,513,1611]],[[0,0,368,513,1614]],[[0,0,368,513,1617]],[[0,0,368,513,1620]],[[0,0,368,513,1623]],[[0,0,368,513,1626]],[[0,0,368,513,1629]],[[0,0,368,513,1632]],[[0,0,368,513,1635]],[[0,0,368,513,1638]],[[0,0,368,513,1641]],[[0,0,368,513,1644]],[[0,0,368,513,1647]],[[0,0,368,513,1650]],[[0,0,368,513,1653]],[[0,0,368,513,1656]],[[0,0,368,513,1659]],[[0,0,368,513,1662]],[[0,0,368,513,1665]],[[0,0,368,513,1668]],[[0,0,368,513,1671]],[[0,0,368,513,1674]],[[0,0,368,513,1677]],[[0,0,368,513,1680]],[[0,0,368,513,1683]],[[0,0,368,513,1686]],[[0,0,368,513,1689]],[[0,0,368,513,1692]],[[0,0,368,513,1695]],[[0,0,368,513,1698]],[[0,0,368,513,1701]],[[0,0,368,513,1704]],[[0,0,368,513,1707]],[[0,0,368,513,1710]],[[0,0,368,513,1713]],[[0,0,368,513,1716]],[[0,0,368,513,1719]],[[0,0,368,513,1722]],[[0,0,368,513,1725]],[[0,0,368,513,1728]],[[0,0,368,513,1731]],[[0,0,368,513,1734]],[[0,0,368,513,1737]],[[0,0,368,513,1740]],[[0,0,368,513,1743]],[[0,0,368,513,1746]],[[0,0,368,513,1749]],[[0,0,368,513,1752]],[[0,0,368,513,1755]],[[0,0,368,513,1758]],[[0,0,368,513,1761]],[[0,0,368,513,1764]],[[0,0,368,513,1767]],[[0,0,368,513,1770]],[[0,0,368,513,1773]],[[0,0,368,513,1776]],[[0,0,368,513,1779]],[[0,0,368,513,1782]],[[0,0,368,513,1785]],[[0,0,368,513,1788]],[[0,0,368,513,1791]],[[0,0,368,513,1794]],[[0,0,368,513,1797]],[[0,0,368,5
13,1800]],[[0,0,368,513,1803]],[[0,0,368,513,1806]],[[0,0,368,513,1809]],[[0,0,368,513,1812]],[[0,0,368,513,1815]],[[0,0,368,513,1818]],[[0,0,368,513,1821]],[[0,0,368,513,1824]],[[0,0,368,513,1827]],[[0,0,368,513,1830]],[[0,0,368,513,1833]],[[0,0,368,513,1836]],[[0,0,368,513,1839]],[[0,0,368,513,1842]],[[0,0,368,513,1845]],[[0,0,368,513,1848]],[[0,0,368,513,1851]],[[0,0,368,513,1854]],[[0,0,368,513,1857]],[[0,0,368,513,1860]],[[0,0,368,513,1863]],[[0,0,368,513,1866]],[[0,0,368,513,1869]],[[0,0,368,513,1872]],[[0,0,368,513,1875]],[[0,0,368,513,1878]],[[0,0,368,513,1881]],[[0,0,368,513,1884]],[[0,0,368,513,1887]],[[0,0,368,513,1890]],[[0,0,368,513,1893]],[[0,0,368,513,1896]],[[0,0,368,513,1899]],[[0,0,368,513,1902]],[[0,0,368,513,1905]],[[0,0,368,513,1908]],[[0,0,368,513,1911]],[[0,0,368,513,1914]],[[0,0,368,513,1917]],[[0,0,368,513,1920]],[[0,0,368,513,1923]],[[0,0,368,513,1926]],[[0,0,368,513,1929]],[[0,0,368,513,1932]],[[0,0,368,513,1935]],[[0,0,368,513,1938]],[[0,0,368,513,1941]],[[0,0,368,513,1944]],[[0,0,368,513,1947]],[[0,0,368,513,1950]],[[0,0,368,513,1953]],[[0,0,368,513,1956]],[[0,0,368,513,1959]],[[0,0,368,513,1962]],[[0,0,368,513,1965]],[[0,0,368,513,1968]],[[0,0,368,513,1971]],[[0,0,368,513,1974]],[[0,0,368,513,1977]],[[0,0,368,513,1980]],[[0,0,368,513,1983]],[[0,0,368,513,1986]],[[0,0,368,513,1989]],[[0,0,368,513,1992]],[[0,0,368,513,1995]],[[0,0,368,513,1998]],[[0,0,368,513,2001]],[[0,0,368,513,2004]],[[0,0,368,513,2007]],[[0,0,368,513,2010]],[[0,0,368,513,2013]],[[0,0,368,513,2016]],[[0,0,368,513,2019]],[[0,0,368,513,2022]],[[0,0,368,513,2025]],[[0,0,368,513,2028]],[[0,0,368,513,2031]],[[0,0,368,513,2034]],[[0,0,368,513,2037]],[[0,0,368,513,2040]],[[0,0,368,513,2043]],[[0,0,368,513,2046]],[[0,0,368,513,2049]],[[0,0,368,513,2052]],[[0,0,368,513,2055]],[[0,0,368,513,2058]],[[0,0,368,513,2061]],[[0,0,368,513,2064]],[[0,0,368,513,2067]],[[0,0,368,513,2070]],[[0,0,368,513,2073]],[[0,0,368,513,2076]],[[0,0,368,513,2079]],[[0,0,368,513,2082]],[[0,0,368,513,20
85]],[[0,0,368,513,2088]],[[0,0,368,513,2091]],[[0,0,368,513,2094]],[[0,0,368,513,2097]],[[0,0,368,513,2100]],[[0,0,368,513,2103]],[[0,0,368,513,2106]],[[0,0,368,513,2109]],[[0,0,368,513,2112]],[[0,0,368,513,2115]],[[0,0,368,513,2118]],[[0,0,368,513,2121]],[[0,0,368,513,2124]],[[0,0,368,513,2127]],[[0,0,368,513,2130]],[[0,0,368,513,2133]],[[0,0,368,513,2136]],[[0,0,368,513,2139]],[[0,0,368,513,2142]],[[0,0,368,513,2145]],[[0,0,368,513,2148]],[[0,0,368,513,2151]],[[0,0,368,513,2154]],[[0,0,368,513,2157]],[[0,0,368,513,2160]],[[0,0,368,513,2163]],[[0,0,368,513,2166]],[[0,0,368,513,2169]],[[0,0,368,513,2172]],[[0,0,368,513,2175]],[[0,0,368,513,2178]],[[0,0,368,513,2181]],[[0,0,368,513,2184]],[[0,0,368,513,2187]],[[0,0,368,513,2190]],[[0,0,368,513,2193]],[[0,0,368,513,2196]],[[0,0,368,513,2199]],[[0,0,368,513,2202]],[[0,0,368,513,2205]],[[0,0,368,513,2208]],[[0,0,368,513,2211]],[[0,0,368,513,2214]],[[0,0,368,513,2217]],[[0,0,368,513,2220]],[[0,0,368,513,2223]],[[0,0,368,513,2226]],[[0,0,368,513,2229]],[[0,0,368,513,2232]],[[0,0,368,513,2235]],[[0,0,368,513,2238]],[[0,0,368,513,2241]],[[0,0,368,513,2244]],[[0,0,368,513,2247]],[[0,0,368,513,2250]],[[0,0,368,513,2253]],[[0,0,368,513,2256]],[[0,0,368,513,2259]],[[0,0,368,513,2262]],[[0,0,368,513,2265]],[[0,0,368,513,2268]],[[0,0,368,513,2271]],[[0,0,368,513,2274]],[[0,0,368,513,2277]],[[0,0,368,513,2280]],[[0,0,368,513,2283]],[[0,0,368,513,2286]],[[0,0,368,513,2289]],[[0,0,368,513,2292]],[[0,0,368,513,2295]],[[0,0,368,513,2298]],[[0,0,368,513,2301]],[[0,0,368,513,2304]],[[0,0,368,513,2307]],[[0,0,368,513,2310]],[[0,0,368,513,2313]],[[0,0,368,513,2316]],[[0,0,368,513,2319]],[[0,0,368,513,2322]],[[0,0,368,513,2325]],[[0,0,368,513,2328]],[[0,0,368,513,2331]],[[0,0,368,513,2334]],[[0,0,368,513,2337]],[[0,0,368,513,2340]],[[0,0,368,513,2343]],[[0,0,368,513,2346]],[[0,0,368,513,2349]],[[0,0,368,513,2352]],[[0,0,368,513,2355]],[[0,0,368,513,2358]],[[0,0,368,513,2361]],[[0,0,368,513,2364]],[[0,0,368,513,2367]],[[0,0,368,513,2370]],
[[0,0,368,513,2373]],[[0,0,368,513,2376]],[[0,0,368,513,2379]],[[0,0,368,513,2382]],[[0,0,368,513,2385]],[[0,0,368,513,2388]],[[0,0,368,513,2391]],[[0,0,368,513,2394]],[[0,0,368,513,2397]],[[0,0,368,513,2400]],[[0,0,368,513,2403]],[[0,0,368,513,2406]],[[0,0,368,513,2409]],[[0,0,368,513,2412]],[[0,0,368,513,2415]],[[0,0,368,513,2418]],[[0,0,368,513,2421]],[[0,0,368,513,2424]],[[0,0,368,513,2427]],[[0,0,368,513,2430]],[[0,0,368,513,2433]],[[0,0,368,513,2436]],[[0,0,368,513,2439]],[[0,0,368,513,2442]],[[0,0,368,513,2445]],[[0,0,368,513,2448]],[[0,0,368,513,2451]],[[0,0,368,513,2454]],[[0,0,368,513,2457]],[[0,0,368,513,2460]],[[0,0,368,513,2463]],[[0,0,368,513,2466]],[[0,0,368,513,2469]],[[0,0,368,513,2472]],[[0,0,368,513,2475]],[[0,0,368,513,2478]],[[0,0,368,513,2481]],[[0,0,368,513,2484]],[[0,0,368,513,2487]],[[0,0,368,513,2490]],[[0,0,368,513,2493]],[[0,0,368,513,2496]],[[0,0,368,513,2499]],[[0,0,368,513,2502]],[[0,0,368,513,2505]],[[0,0,368,513,2508]],[[0,0,368,513,2511]],[[0,0,368,513,2514]],[[0,0,368,513,2517]],[[0,0,368,513,2520]],[[0,0,368,513,2523]],[[0,0,368,513,2526]],[[0,0,368,513,2529]],[[0,0,368,513,2532]],[[0,0,368,513,2535]],[[0,0,368,513,2538]],[[0,0,368,513,2541]],[[0,0,368,513,2544]],[[0,0,368,513,2547]],[[0,0,368,513,2550]],[[0,0,368,513,2553]],[[0,0,368,513,2556]],[[0,0,368,513,2559]],[[0,0,368,513,2562]],[[0,0,368,513,2565]],[[0,0,368,513,2568]],[[0,0,368,513,2571]],[[0,0,368,513,2574]],[[0,0,368,513,2577]],[[0,0,368,513,2580]],[[0,0,368,513,2583]],[[0,0,368,513,2586]],[[0,0,368,513,2589]],[[0,0,368,513,2592]],[[0,0,368,513,2595]],[[0,0,368,513,2598]],[[0,0,368,513,2601]],[[0,0,368,513,2604]],[[0,0,368,513,2607]],[[0,0,368,513,2610]],[[0,0,368,513,2613]],[[0,0,368,513,2616]],[[0,0,368,513,2619]],[[0,0,368,513,2622]],[[0,0,368,513,2625]],[[0,0,368,513,2628]],[[0,0,368,513,2631]],[[0,0,368,513,2634]],[[0,0,368,513,2637]],[[0,0,368,513,2640]],[[0,0,368,513,2643]],[[0,0,368,513,2646]],[[0,0,368,513,2649]],[[0,0,368,513,2652]],[[0,0,368,513,2655]],[[0,0
,368,513,2658]],[[0,0,368,513,2661]],[[0,0,368,513,2664]],[[0,0,368,513,2667]],[[0,0,368,513,2670]],[[0,0,368,513,2673]],[[0,0,368,513,2676]],[[0,0,368,513,2679]],[[0,0,368,513,2682]],[[0,0,368,513,2685]],[[0,0,368,513,2688]],[[0,0,368,513,2691]],[[0,0,368,513,2694]],[[0,0,368,513,2697]],[[0,0,368,513,2700]],[[0,0,368,513,2703]],[[0,0,368,513,2706]],[[0,0,368,513,2709]],[[0,0,368,513,2712]],[[0,0,368,513,2715]],[[0,0,368,513,2718]],[[0,0,368,513,2721]],[[0,0,368,513,2724]],[[0,0,368,513,2727]],[[0,0,368,513,2730]],[[0,0,368,513,2733]],[[0,0,368,513,2736]],[[0,0,368,513,2739]],[[0,0,368,513,2742]],[[0,0,368,513,2745]],[[0,0,368,513,2748]],[[0,0,368,513,2751]],[[0,0,368,513,2754]],[[0,0,368,513,2757]],[[0,0,368,513,2760]],[[0,0,368,513,2763]],[[0,0,368,513,2766]],[[0,0,368,513,2769]],[[0,0,368,513,2772]],[[0,0,368,513,2775]],[[0,0,368,513,2778]],[[0,0,368,513,2781]],[[0,0,368,513,2784]],[[0,0,368,513,2787]],[[0,0,368,513,2790]],[[0,0,368,513,2793]],[[0,0,368,513,2796]]],"text_len_per_page":[53,53,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,
54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54,54],"metadata":{"format":"PDF 
1.6","title":"","author":"","subject":"","keywords":"","creator":"Adobe Acrobat 7.0","producer":"Adobe Acrobat 7.0 Image Conversion Plug-in","creationDate":"D:20080404141457+01'00'","modDate":"D:20080404144821+01'00'","trapped":"","encryption":null}} + # o = json.loads(json.dumps(o)) + # total_page = o["total_page"] + # page_width = o["page_width_pts"] + # page_height = o["page_height_pts"] + # img_sz_list = o["image_info_per_page"] + # text_len_list = o['text_len_per_page'] + # pdf_path = o['pdf_path'] + # is_encrypted = o['is_encrypted'] + # is_needs_password = o['is_needs_password'] + # if is_encrypted or total_page == 0 or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + # print("加密的") + # exit(0) + # tag = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list) + # o['is_text_pdf'] = tag + # print(json.dumps(o, ensure_ascii=False)) diff --git a/magic_pdf/filter/pdf_meta_scan.py b/magic_pdf/filter/pdf_meta_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..89d44878d313a1a72f3bb311c812195e546d2a76 --- /dev/null +++ b/magic_pdf/filter/pdf_meta_scan.py @@ -0,0 +1,388 @@ +""" +输入: s3路径,每行一个 +输出: pdf文件元信息,包括每一页上的所有图片的长宽高,bbox位置 +""" +import sys +import click + +from magic_pdf.libs.commons import read_file, mymax, get_top_percent_list +from magic_pdf.libs.commons import fitz +from loguru import logger +from collections import Counter + +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.language import detect_lang +from magic_pdf.libs.pdf_check import detect_invalid_chars + +scan_max_page = 50 +junk_limit_min = 10 + + +def calculate_max_image_area_per_page(result: list, page_width_pts, page_height_pts): + max_image_area_per_page = [mymax([(x1 - x0) * (y1 - y0) for x0, y0, x1, y1, _ in page_img_sz]) for page_img_sz in + result] + page_area = int(page_width_pts) * int(page_height_pts) + max_image_area_per_page = [area / page_area for area in max_image_area_per_page] + max_image_area_per_page = [area 
for area in max_image_area_per_page if area > 0.6] + return max_image_area_per_page + + +def process_image(page, junk_img_bojids=[]): + page_result = [] # 存每个页面里的多张图四元组信息 + items = page.get_images() + dedup = set() + for img in items: + # 这里返回的是图片在page上的实际展示的大小。返回一个数组,每个元素第一部分是 + img_bojid = img[0] # 在pdf文件中是全局唯一的,如果这个图反复出现在pdf里那么就可能是垃圾信息,例如水印、页眉页脚等 + if img_bojid in junk_img_bojids: # 如果是垃圾图像,就跳过 + continue + recs = page.get_image_rects(img, transform=True) + if recs: + rec = recs[0][0] + x0, y0, x1, y1 = map(int, rec) + width = x1 - x0 + height = y1 - y0 + if (x0, y0, x1, y1, img_bojid) in dedup: # 这里面会出现一些重复的bbox,无需重复出现,需要去掉 + continue + if not all([width, height]): # 长和宽任何一个都不能是0,否则这个图片不可见,没有实际意义 + continue + dedup.add((x0, y0, x1, y1, img_bojid)) + page_result.append([x0, y0, x1, y1, img_bojid]) + return page_result + + +def get_image_info(doc: fitz.Document, page_width_pts, page_height_pts) -> list: + """ + 返回每个页面里的图片的四元组,每个页面多个图片。 + :param doc: + :return: + """ + # 使用 Counter 计数 img_bojid 的出现次数 + img_bojid_counter = Counter(img[0] for page in doc for img in page.get_images()) + # 找出出现次数超过 len(doc) 半数的 img_bojid + + junk_limit = max(len(doc) * 0.5, junk_limit_min) # 对一些页数比较少的进行豁免 + + junk_img_bojids = [img_bojid for img_bojid, count in img_bojid_counter.items() if count >= junk_limit] + + #todo 加个判断,用前十页就行,这些垃圾图片需要满足两个条件,不止出现的次数要足够多,而且图片占书页面积的比例要足够大,且图与图大小都差不多 + #有两种扫描版,一种文字版,这里可能会有误判 + #扫描版1:每页都有所有扫描页图片,特点是图占比大,每页展示1张 + #扫描版2,每页存储的扫描页图片数量递增,特点是图占比大,每页展示1张,需要清空junklist跑前50页图片信息用于分类判断 + #文字版1.每页存储所有图片,特点是图片占页面比例不大,每页展示可能为0也可能不止1张 这种pdf需要拿前10页抽样检测img大小和个数,如果符合需要清空junklist + imgs_len_list = [len(page.get_images()) for page in doc] + + special_limit_pages = 10 + + # 统一用前十页结果做判断 + result = [] + break_loop = False + for i, page in enumerate(doc): + if break_loop: + break + if i >= special_limit_pages: + break + page_result = process_image(page) # 这里不传junk_img_bojids,拿前十页所有图片信息用于后续分析 + result.append(page_result) + for item in result: + if not any(item): # 
如果任何一页没有图片,说明是个文字版,需要判断是否为特殊文字版 + if max(imgs_len_list) == min(imgs_len_list) and max( + imgs_len_list) >= junk_limit_min: # 如果是特殊文字版,就把junklist置空并break + junk_img_bojids = [] + else: # 不是特殊文字版,是个普通文字版,但是存在垃圾图片,不置空junklist + pass + break_loop = True + break + if not break_loop: + # 获取前80%的元素 + top_eighty_percent = get_top_percent_list(imgs_len_list, 0.8) + # 检查前80%的元素是否都相等 + if len(set(top_eighty_percent)) == 1 and max(imgs_len_list) >= junk_limit_min: + + # # 如果前10页跑完都有图,根据每页图片数量是否相等判断是否需要清除junklist + # if max(imgs_len_list) == min(imgs_len_list) and max(imgs_len_list) >= junk_limit_min: + + #前10页都有图,且每页数量一致,需要检测图片大小占页面的比例判断是否需要清除junklist + max_image_area_per_page = calculate_max_image_area_per_page(result, page_width_pts, page_height_pts) + if len(max_image_area_per_page) < 0.8 * special_limit_pages: # 前10页不全是大图,说明可能是个文字版pdf,把垃圾图片list置空 + junk_img_bojids = [] + else: # 前10页都有图,而且80%都是大图,且每页图片数量一致并都很多,说明是扫描版1,不需要清空junklist + pass + else: # 每页图片数量不一致,需要清掉junklist全量跑前50页图片 + junk_img_bojids = [] + + #正式进入取前50页图片的信息流程 + result = [] + for i, page in enumerate(doc): + if i >= scan_max_page: + break + page_result = process_image(page, junk_img_bojids) + # logger.info(f"page {i} img_len: {len(page_result)}") + result.append(page_result) + + return result, junk_img_bojids + + +def get_pdf_page_size_pts(doc: fitz.Document): + page_cnt = len(doc) + l: int = min(page_cnt, 50) + #把所有宽度和高度塞到两个list 分别取中位数(中间遇到了个在纵页里塞横页的pdf,导致宽高互换了) + page_width_list = [] + page_height_list = [] + for i in range(l): + page = doc[i] + page_rect = page.rect + page_width_list.append(page_rect.width) + page_height_list.append(page_rect.height) + + page_width_list.sort() + page_height_list.sort() + + median_width = page_width_list[len(page_width_list) // 2] + median_height = page_height_list[len(page_height_list) // 2] + + return median_width, median_height + + +def get_pdf_textlen_per_page(doc: fitz.Document): + text_len_lst = [] + for page in doc: + # 拿包含img和text的所有blocks + # text_block = 
page.get_text("blocks") + # 拿所有text的blocks + # text_block = page.get_text("words") + # text_block_len = sum([len(t[4]) for t in text_block]) + #拿所有text的str + text_block = page.get_text("text") + text_block_len = len(text_block) + # logger.info(f"page {page.number} text_block_len: {text_block_len}") + text_len_lst.append(text_block_len) + + return text_len_lst + + +def get_pdf_text_layout_per_page(doc: fitz.Document): + """ + 根据PDF文档的每一页文本布局,判断该页的文本布局是横向、纵向还是未知。 + + Args: + doc (fitz.Document): PDF文档对象。 + + Returns: + List[str]: 每一页的文本布局(横向、纵向、未知)。 + + """ + text_layout_list = [] + + for page_id, page in enumerate(doc): + if page_id >= scan_max_page: + break + # 创建每一页的纵向和横向的文本行数计数器 + vertical_count = 0 + horizontal_count = 0 + text_dict = page.get_text("dict") + if "blocks" in text_dict: + for block in text_dict["blocks"]: + if 'lines' in block: + for line in block["lines"]: + # 获取line的bbox顶点坐标 + x0, y0, x1, y1 = line['bbox'] + # 计算bbox的宽高 + width = x1 - x0 + height = y1 - y0 + # 计算bbox的面积 + area = width * height + font_sizes = [] + for span in line['spans']: + if 'size' in span: + font_sizes.append(span['size']) + if len(font_sizes) > 0: + average_font_size = sum(font_sizes) / len(font_sizes) + else: + average_font_size = 10 # 有的line拿不到font_size,先定一个阈值100 + if area <= average_font_size ** 2: # 判断bbox的面积是否小于平均字体大小的平方,单字无法计算是横向还是纵向 + continue + else: + if 'wmode' in line: # 通过wmode判断文本方向 + if line['wmode'] == 1: # 判断是否为竖向文本 + vertical_count += 1 + elif line['wmode'] == 0: # 判断是否为横向文本 + horizontal_count += 1 + # if 'dir' in line: # 通过旋转角度计算判断文本方向 + # # 获取行的 "dir" 值 + # dir_value = line['dir'] + # cosine, sine = dir_value + # # 计算角度 + # angle = math.degrees(math.acos(cosine)) + # + # # 判断是否为横向文本 + # if abs(angle - 0) < 0.01 or abs(angle - 180) < 0.01: + # # line_text = ' '.join(span['text'] for span in line['spans']) + # # print('This line is horizontal:', line_text) + # horizontal_count += 1 + # # 判断是否为纵向文本 + # elif abs(angle - 90) < 0.01 or abs(angle - 270) < 0.01: + 
# # line_text = ' '.join(span['text'] for span in line['spans']) + # # print('This line is vertical:', line_text) + # vertical_count += 1 + # print(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") + # 判断每一页的文本布局 + if vertical_count == 0 and horizontal_count == 0: # 该页没有文本,无法判断 + text_layout_list.append("unknow") + continue + else: + if vertical_count > horizontal_count: # 该页的文本纵向行数大于横向的 + text_layout_list.append("vertical") + else: # 该页的文本横向行数大于纵向的 + text_layout_list.append("horizontal") + # logger.info(f"page_id: {page_id}, vertical_count: {vertical_count}, horizontal_count: {horizontal_count}") + return text_layout_list + + +'''定义一个自定义异常用来抛出单页svg太多的pdf''' + + +class PageSvgsTooManyError(Exception): + def __init__(self, message="Page SVGs are too many"): + self.message = message + super().__init__(self.message) + + +def get_svgs_per_page(doc: fitz.Document): + svgs_len_list = [] + for page_id, page in enumerate(doc): + # svgs = page.get_drawings() + svgs = page.get_cdrawings() # 切换成get_cdrawings,效率更高 + len_svgs = len(svgs) + if len_svgs >= 3000: + raise PageSvgsTooManyError() + else: + svgs_len_list.append(len_svgs) + # logger.info(f"page_id: {page_id}, svgs_len: {len(svgs)}") + return svgs_len_list + + +def get_imgs_per_page(doc: fitz.Document): + imgs_len_list = [] + for page_id, page in enumerate(doc): + imgs = page.get_images() + imgs_len_list.append(len(imgs)) + # logger.info(f"page_id: {page}, imgs_len: {len(imgs)}") + + return imgs_len_list + + +def get_language(doc: fitz.Document): + """ + 获取PDF文档的语言。 + Args: + doc (fitz.Document): PDF文档对象。 + Returns: + str: 文档语言,如 "en-US"。 + """ + language_lst = [] + for page_id, page in enumerate(doc): + if page_id >= scan_max_page: + break + # 拿所有text的str + text_block = page.get_text("text") + page_language = detect_lang(text_block) + language_lst.append(page_language) + + # logger.info(f"page_id: {page_id}, page_language: {page_language}") + + # 统计text_language_list中每种语言的个数 
+ count_dict = Counter(language_lst) + # 输出text_language_list中出现的次数最多的语言 + language = max(count_dict, key=count_dict.get) + return language + + +def check_invalid_chars(pdf_bytes): + """ + 乱码检测 + """ + return detect_invalid_chars(pdf_bytes) + + +def pdf_meta_scan(pdf_bytes: bytes): + """ + :param s3_pdf_path: + :param pdf_bytes: pdf文件的二进制数据 + 几个维度来评价:是否加密,是否需要密码,纸张大小,总页数,是否文字可提取 + """ + doc = fitz.open("pdf", pdf_bytes) + is_needs_password = doc.needs_pass + is_encrypted = doc.is_encrypted + total_page = len(doc) + if total_page == 0: + logger.warning(f"drop this pdf, drop_reason: {DropReason.EMPTY_PDF}") + result = {"_need_drop": True, "_drop_reason": DropReason.EMPTY_PDF} + return result + else: + page_width_pts, page_height_pts = get_pdf_page_size_pts(doc) + # logger.info(f"page_width_pts: {page_width_pts}, page_height_pts: {page_height_pts}") + + # svgs_per_page = get_svgs_per_page(doc) + # logger.info(f"svgs_per_page: {svgs_per_page}") + imgs_per_page = get_imgs_per_page(doc) + # logger.info(f"imgs_per_page: {imgs_per_page}") + + image_info_per_page, junk_img_bojids = get_image_info(doc, page_width_pts, page_height_pts) + # logger.info(f"image_info_per_page: {image_info_per_page}, junk_img_bojids: {junk_img_bojids}") + text_len_per_page = get_pdf_textlen_per_page(doc) + # logger.info(f"text_len_per_page: {text_len_per_page}") + text_layout_per_page = get_pdf_text_layout_per_page(doc) + # logger.info(f"text_layout_per_page: {text_layout_per_page}") + text_language = get_language(doc) + # logger.info(f"text_language: {text_language}") + invalid_chars = check_invalid_chars(pdf_bytes) + # logger.info(f"invalid_chars: {invalid_chars}") + + # 最后输出一条json + res = { + "is_needs_password": is_needs_password, + "is_encrypted": is_encrypted, + "total_page": total_page, + "page_width_pts": int(page_width_pts), + "page_height_pts": int(page_height_pts), + "image_info_per_page": image_info_per_page, + "text_len_per_page": text_len_per_page, + "text_layout_per_page": 
text_layout_per_page, + "text_language": text_language, + # "svgs_per_page": svgs_per_page, + "imgs_per_page": imgs_per_page, # 增加每页img数量list + "junk_img_bojids": junk_img_bojids, # 增加垃圾图片的bojid list + "invalid_chars": invalid_chars, + "metadata": doc.metadata + } + # logger.info(json.dumps(res, ensure_ascii=False)) + return res + + +@click.command() +@click.option('--s3-pdf-path', help='s3上pdf文件的路径') +@click.option('--s3-profile', help='s3上的profile') +def main(s3_pdf_path: str, s3_profile: str): + """ + + """ + try: + file_content = read_file(s3_pdf_path, s3_profile) + pdf_meta_scan(file_content) + except Exception as e: + print(f"ERROR: {s3_pdf_path}, {e}", file=sys.stderr) + logger.exception(e) + + +if __name__ == '__main__': + main() + # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师-大乘无量寿.pdf" + # "D:\project/20231108code-clean\pdf_cost_time\竖排例子\三国演义_繁体竖排版.pdf" + # "D:\project/20231108code-clean\pdf_cost_time\scihub\scihub_86800000\libgen.scimag86880000-86880999.zip_10.1021/acsami.1c03109.s002.pdf" + # "D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_18600000/libgen.scimag18645000-18645999.zip_10.1021/om3006239.pdf" + # file_content = read_file("D:/project/20231108code-clean/pdf_cost_time/scihub/scihub_31000000/libgen.scimag31098000-31098999.zip_10.1109/isit.2006.261791.pdf","") + # file_content = read_file("D:\project/20231108code-clean\pdf_cost_time\竖排例子\净空法师_大乘无量寿.pdf","") + # doc = fitz.open("pdf", file_content) + # text_layout_lst = get_pdf_text_layout_per_page(doc) + # print(text_layout_lst) diff --git a/magic_pdf/layout/__init__.py b/magic_pdf/layout/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/layout/bbox_sort.py b/magic_pdf/layout/bbox_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..5e1508ff5c3dc41a44035af494a486ad605c1cad --- /dev/null +++ b/magic_pdf/layout/bbox_sort.py @@ -0,0 +1,681 @@ +# 
定义这里的bbox是一个list [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None +# 其中x0, y0代表左上角坐标,x1, y1代表右下角坐标,坐标原点在左上角。 + + + +from magic_pdf.layout.layout_spiler_recog import get_spilter_of_page +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_vertical_full_overlap +from magic_pdf.libs.commons import mymax + +X0_IDX = 0 +Y0_IDX = 1 +X1_IDX = 2 +Y1_IDX = 3 +CONTENT_IDX = 4 +IDX_X = 5 +IDX_Y = 6 +CONTENT_TYPE_IDX = 7 + +X0_EXT_IDX = 8 +Y0_EXT_IDX = 9 +X1_EXT_IDX = 10 +Y1_EXT_IDX = 11 + + +def prepare_bboxes_for_layout_split(image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info, text_raw_blocks: dict, page_boundry, page): + """ + text_raw_blocks:结构参考test/assets/papre/pymu_textblocks.json + 把bbox重新组装成一个list,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 初始时候idx_x, idx_y都是None. 对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是pymupdf里的block结构 + """ + all_bboxes = [] + + for image in image_info: + box = image['bbox'] + # 由于没有实现横向的栏切分,因此在这里先过滤掉一些小的图片。这些图片有可能影响layout,造成没有横向栏切分的情况下,layout切分不准确。例如 scihub_76500000/libgen.scimag76570000-76570999.zip_10.1186/s13287-019-1355-1 + # 把长宽都小于50的去掉 + if abs(box[0]-box[2]) < 50 and abs(box[1]-box[3]) < 50: + continue + all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'image', None, None, None, None]) + + for table in table_info: + box = table['bbox'] + all_bboxes.append([box[0], box[1], box[2], box[3], None, None, None, 'table', None, None, None, None]) + + """由于公式与段落混合,因此公式不再参与layout划分,无需加入all_bboxes""" + # 加入文本block + text_block_temp = [] + for block in text_raw_blocks: + bbox = block['bbox'] + text_block_temp.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) + + text_block_new = resolve_bbox_overlap_for_layout_det(text_block_temp) + text_block_new = filter_lines_bbox(text_block_new) # 去掉线条bbox,有可能让layout探测陷入无限循环 + + 
+ """找出会影响layout的色块、横向分割线""" + spilter_bboxes = get_spilter_of_page(page, [b['bbox'] for b in image_info]+[b['bbox'] for b in image_backup_info], [b['bbox'] for b in table_info], ) + # 还要去掉存在于spilter_bboxes里的text_block + if len(spilter_bboxes) > 0: + text_block_new = [box for box in text_block_new if not any([_is_in_or_part_overlap(box[:4], spilter_bbox) for spilter_bbox in spilter_bboxes])] + + for bbox in text_block_new: + all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'text', None, None, None, None]) + + for bbox in spilter_bboxes: + all_bboxes.append([bbox[0], bbox[1], bbox[2], bbox[3], None, None, None, 'spilter', None, None, None, None]) + + + return all_bboxes + +def resolve_bbox_overlap_for_layout_det(bboxes:list): + """ + 1. 去掉bbox互相包含的,去掉被包含的 + 2. 上下方向上如果有重叠,就扩大大box范围,直到覆盖小box + """ + def _is_in_other_bbox(i:int): + """ + 判断i个box是否被其他box有所包含 + """ + for j in range(0, len(bboxes)): + if j!=i and _is_in(bboxes[i][:4], bboxes[j][:4]): + return True + # elif j!=i and _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]): + # return True + + return False + + # 首先去掉被包含的bbox + new_bbox_1 = [] + for i in range(0, len(bboxes)): + if not _is_in_other_bbox(i): + new_bbox_1.append(bboxes[i]) + + # 其次扩展大的box + new_box = [] + new_bbox_2 = [] + len_1 = len(new_bbox_2) + while True: + merged_idx = [] + for i in range(0, len(new_bbox_1)): + if i in merged_idx: + continue + for j in range(i+1, len(new_bbox_1)): + if j in merged_idx: + continue + bx1 = new_bbox_1[i] + bx2 = new_bbox_1[j] + if i!=j and _is_vertical_full_overlap(bx1[:4], bx2[:4]): + merged_box = min([bx1[0], bx2[0]]), min([bx1[1], bx2[1]]), max([bx1[2], bx2[2]]), max([bx1[3], bx2[3]]) + new_bbox_2.append(merged_box) + merged_idx.append(i) + merged_idx.append(j) + + for i in range(0, len(new_bbox_1)): # 没有合并的加入进来 + if i not in merged_idx: + new_bbox_2.append(new_bbox_1[i]) + + if len(new_bbox_2)==0 or len_1==len(new_bbox_2): + break + else: + len_1 = len(new_bbox_2) + new_box = 
new_bbox_2 + new_bbox_1, new_bbox_2 = new_bbox_2, [] + + return new_box + + +def filter_lines_bbox(bboxes: list): + """ + 过滤掉bbox为空的行 + """ + new_box = [] + for box in bboxes: + x0, y0, x1, y1 = box[0], box[1], box[2], box[3] + if abs(x0-x1)<=1 or abs(y0-y1)<=1: + continue + else: + new_box.append(box) + return new_box + + +################################################################################ +# 第一种排序算法 +# 以下是基于延长线遮挡做的一个算法 +# +################################################################################ +def find_all_left_bbox(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox左边的所有bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]] + return left_boxes + + +def find_all_top_bbox(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox上面的所有bbox + """ + top_boxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX]] + return top_boxes + + +def get_and_set_idx_x(this_bbox, all_bboxes) -> int: + """ + 寻找this_bbox在all_bboxes中的遮挡深度 idx_x + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + all_left_bboxes = find_all_left_bbox(this_bbox, all_bboxes) + if len(all_left_bboxes) == 0: + this_bbox[IDX_X] = 0 + else: + all_left_bboxes_idx = [get_and_set_idx_x(bbox, all_bboxes) for bbox in all_left_bboxes] + max_idx_x = mymax(all_left_bboxes_idx) + this_bbox[IDX_X] = max_idx_x + 1 + return this_bbox[IDX_X] + + +def get_and_set_idx_y(this_bbox, all_bboxes) -> int: + """ + 寻找this_bbox在all_bboxes中y方向的遮挡深度 idx_y + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + all_top_bboxes = find_all_top_bbox(this_bbox, all_bboxes) + if len(all_top_bboxes) == 0: + this_bbox[IDX_Y] = 0 + else: + all_top_bboxes_idx = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_top_bboxes] + max_idx_y = mymax(all_top_bboxes_idx) + this_bbox[IDX_Y] = max_idx_y + 1 + return this_bbox[IDX_Y] + + +def bbox_sort(all_bboxes: list): + """ + 排序 + """ + all_bboxes_idx_x = [get_and_set_idx_x(bbox, all_bboxes) 
for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + + +################################################################################ +# 第二种排序算法 +# 下面的算法在计算idx_x和idx_y的时候不考虑延长线,而只考虑实际的长或者宽被遮挡的情况 +# +################################################################################ + +def find_left_nearest_bbox(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True) + left_boxes = [left_boxes[0]] + else: + left_boxes = [] + return left_boxes + + +def get_and_set_idx_x_2(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + left_nearest_bbox = find_left_nearest_bbox(this_bbox, all_bboxes) + if len(left_nearest_bbox) == 0: + this_bbox[IDX_X] = 0 + else: + left_idx_x = get_and_set_idx_x_2(left_nearest_bbox[0], all_bboxes) + this_bbox[IDX_X] = left_idx_x + 1 + return this_bbox[IDX_X] + + +def find_top_nearest_bbox(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox + """ + top_boxes = 
[box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(top_boxes) > 0: + top_boxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_boxes = [top_boxes[0]] + else: + top_boxes = [] + return top_boxes + + +def get_and_set_idx_y_2(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + top_nearest_bbox = find_top_nearest_bbox(this_bbox, all_bboxes) + if len(top_nearest_bbox) == 0: + this_bbox[IDX_Y] = 0 + else: + top_idx_y = get_and_set_idx_y_2(top_nearest_bbox[0], all_bboxes) + this_bbox[IDX_Y] = top_idx_y + 1 + return this_bbox[IDX_Y] + + +def paper_bbox_sort(all_bboxes: list, page_width, page_height): + all_bboxes_idx_x = [get_and_set_idx_x_2(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y_2(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + +################################################################################ +""" +第三种排序算法, 假设page的最左侧为X0,最右侧为X1,最上侧为Y0,最下侧为Y1 +这个排序算法在第二种算法基础上增加对bbox的预处理步骤。预处理思路如下: +1. 
首先在水平方向上对bbox进行扩展。扩展方法是: + - 对每个bbox,找到其左边最近的bbox(也就是y方向有重叠),然后将其左边界扩展到左边最近bbox的右边界(x1+1),这里加1是为了避免重叠。如果没有左边的bbox,那么就将其左边界扩展到page的最左侧X0。 + - 对每个bbox,找到其右边最近的bbox(也就是y方向有重叠),然后将其右边界扩展到右边最近bbox的左边界(x0-1),这里减1是为了避免重叠。如果没有右边的bbox,那么就将其右边界扩展到page的最右侧X1。 + - 经过上面2个步骤,bbox扩展到了水平方向的最大范围。[左最近bbox.x1+1, 右最近bbox.x0-1] + +2. 合并所有的连续水平方向的bbox, 合并方法是: + - 对bbox进行y方向排序,然后从上到下遍历所有bbox,如果当前bbox和下一个bbox的x0, x1等于X0, X1,那么就合并这两个bbox。 + +3. 然后在垂直方向上对bbox进行扩展。扩展方法是: + - 首先从page上切割掉合并后的水平bbox, 得到几个新的block + 针对每个block + - x0: 扎到位于左侧x=x0延长线的左侧所有的bboxes, 找到最大的x1,让x0=x1+1。如果没有,则x0=X0 + - x1: 找到位于右侧x=x1延长线右侧所有的bboxes, 找到最小的x0, 让x1=x0-1。如果没有,则x1=X1 + 随后在垂直方向上合并所有的连续的block,方法如下: + - 对block进行x方向排序,然后从左到右遍历所有block,如果当前block和下一个block的x0, x1相等,那么就合并这两个block。 + 如果垂直切分后所有小bbox都被分配到了一个block, 那么分割就完成了。这些合并后的block打上标签'GOOD_LAYOUT’ + 如果在某个垂直方向上无法被完全分割到一个block,那么就将这个block打上标签'BAD_LAYOUT'。 + 至此完成,一个页面的预处理,天然的block要么属于'GOOD_LAYOUT',要么属于'BAD_LAYOUT'。针对含有'BAD_LAYOUT'的页面,可以先按照自上而下,自左到右进行天然排序,也可以先过滤掉这种书籍。 + (完成条件下次加强:进行水平方向切分,把混乱的layout部分尽可能切割出去) +""" +################################################################################ +def find_left_neighbor_bboxes(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox + 这里使用扩展之后的bbox + """ + left_boxes = [box for box in all_bboxes if box[X1_EXT_IDX] <= this_bbox[X0_EXT_IDX] and any([ + box[Y0_EXT_IDX] < this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX], box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX] < box[Y1_EXT_IDX], + this_bbox[Y0_EXT_IDX] < box[Y0_EXT_IDX] < this_bbox[Y1_EXT_IDX], this_bbox[Y0_EXT_IDX] < box[Y1_EXT_IDX] < this_bbox[Y1_EXT_IDX], + box[Y0_EXT_IDX]==this_bbox[Y0_EXT_IDX] and box[Y1_EXT_IDX]==this_bbox[Y1_EXT_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX], reverse=True) + left_boxes = left_boxes + else: + left_boxes = [] + return left_boxes + +def find_top_neighbor_bboxes(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有下侧宽度和this_bbox有重叠的bbox + 这里使用扩展之后的bbox + """ + 
top_boxes = [box for box in all_bboxes if box[Y1_EXT_IDX] <= this_bbox[Y0_EXT_IDX] and any([ + box[X0_EXT_IDX] < this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX], box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX] < box[X1_EXT_IDX], + this_bbox[X0_EXT_IDX] < box[X0_EXT_IDX] < this_bbox[X1_EXT_IDX], this_bbox[X0_EXT_IDX] < box[X1_EXT_IDX] < this_bbox[X1_EXT_IDX], + box[X0_EXT_IDX]==this_bbox[X0_EXT_IDX] and box[X1_EXT_IDX]==this_bbox[X1_EXT_IDX]])] + # 然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(top_boxes) > 0: + top_boxes.sort(key=lambda x: x[Y1_EXT_IDX], reverse=True) + top_boxes = top_boxes + else: + top_boxes = [] + return top_boxes + +def get_and_set_idx_x_2_ext(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_x + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_X] is not None: + return this_bbox[IDX_X] + else: + left_nearest_bbox = find_left_neighbor_bboxes(this_bbox, all_bboxes) + if len(left_nearest_bbox) == 0: + this_bbox[IDX_X] = 0 + else: + left_idx_x = [get_and_set_idx_x_2(b, all_bboxes) for b in left_nearest_bbox] + this_bbox[IDX_X] = mymax(left_idx_x) + 1 + return this_bbox[IDX_X] + +def get_and_set_idx_y_2_ext(this_bbox, all_bboxes): + """ + 寻找this_bbox在all_bboxes中的被直接遮挡的深度 idx_y + 这个遮挡深度不考虑延长线,而是被实际的长或者宽遮挡的情况 + """ + if this_bbox[IDX_Y] is not None: + return this_bbox[IDX_Y] + else: + top_nearest_bbox = find_top_neighbor_bboxes(this_bbox, all_bboxes) + if len(top_nearest_bbox) == 0: + this_bbox[IDX_Y] = 0 + else: + top_idx_y = [get_and_set_idx_y_2_ext(b, all_bboxes) for b in top_nearest_bbox] + this_bbox[IDX_Y] = mymax(top_idx_y) + 1 + return this_bbox[IDX_Y] + +def _paper_bbox_sort_ext(all_bboxes: list): + all_bboxes_idx_x = [get_and_set_idx_x_2_ext(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx_y = [get_and_set_idx_y_2_ext(bbox, all_bboxes) for bbox in all_bboxes] + all_bboxes_idx = [(idx_x, idx_y) for idx_x, idx_y in zip(all_bboxes_idx_x, all_bboxes_idx_y)] + + all_bboxes_idx = [idx_x_y[0] * 100000 + idx_x_y[1] for idx_x_y in 
all_bboxes_idx] # 变换成一个点,保证能够先X,X相同时按Y排序 + all_bboxes_idx = list(zip(all_bboxes_idx, all_bboxes)) + all_bboxes_idx.sort(key=lambda x: x[0]) + sorted_bboxes = [bbox for idx, bbox in all_bboxes_idx] + return sorted_bboxes + +# =============================================================================================== +def find_left_bbox_ext_line(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox左边的所有bbox, 使用延长线 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX]] + if len(left_boxes): + left_boxes.sort(key=lambda x: x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + + return left_boxes + +def find_right_bbox_ext_line(this_bbox, all_bboxes) -> list: + """ + 寻找this_bbox右边的所有bbox, 使用延长线 + """ + right_boxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX]] + if len(right_boxes): + right_boxes.sort(key=lambda x: x[X0_IDX]) + right_boxes = right_boxes[0] + else: + right_boxes = None + return right_boxes + +# ============================================================================================= + +def find_left_nearest_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧高度和this_bbox有重叠的bbox, 不用延长线并且不能像 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + return left_boxes + +def find_right_nearst_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + right_bboxes = [box for box in 
all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] and any([ + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]])] + + if len(right_bboxes)>0: + right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX]) + right_bboxes = right_bboxes[0] + else: + right_bboxes = None + return right_bboxes + +def reset_idx_x_y(all_boxes:list)->list: + for box in all_boxes: + box[IDX_X] = None + box[IDX_Y] = None + + return all_boxes + +# =================================================================================================== +def find_top_nearest_bbox_direct(this_bbox, bboxes_collection) -> list: + """ + 找到在this_bbox上方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in bboxes_collection if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 然后再过滤一下,找到上方距离this_bbox最近的那个 + if len(top_bboxes) > 0: + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +def find_bottom_nearest_bbox_direct(this_bbox, bboxes_collection) -> list: + """ + 找到在this_bbox下方且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in bboxes_collection if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + # 
然后再过滤一下,找到水平上距离this_bbox最近的那个 + if len(bottom_bboxes) > 0: + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_boundry_bboxes(bboxes:list) -> tuple: + """ + 找到bboxes的边界——找到所有bbox里最小的(x0, y0), 最大的(x1, y1) + """ + x0, y0, x1, y1 = bboxes[0][X0_IDX], bboxes[0][Y0_IDX], bboxes[0][X1_IDX], bboxes[0][Y1_IDX] + for box in bboxes: + x0 = min(box[X0_IDX], x0) + y0 = min(box[Y0_IDX], y0) + x1 = max(box[X1_IDX], x1) + y1 = max(box[Y1_IDX], y1) + + return x0, y0, x1, y1 + + +def extend_bbox_vertical(bboxes:list, boundry_x0, boundry_y0, boundry_x1, boundry_y1) -> list: + """ + 在垂直方向上扩展能够直接垂直打通的bbox,也就是那些上下都没有其他box的bbox + """ + for box in bboxes: + top_nearest_bbox = find_top_nearest_bbox_direct(box, bboxes) + bottom_nearest_bbox = find_bottom_nearest_bbox_direct(box, bboxes) + if top_nearest_bbox is None and bottom_nearest_bbox is None: # 独占一列 + box[X0_EXT_IDX] = box[X0_IDX] + box[Y0_EXT_IDX] = boundry_y0 + box[X1_EXT_IDX] = box[X1_IDX] + box[Y1_EXT_IDX] = boundry_y1 + # else: + # if top_nearest_bbox is None: + # box[Y0_EXT_IDX] = boundry_y0 + # else: + # box[Y0_EXT_IDX] = top_nearest_bbox[Y1_IDX] + 1 + # if bottom_nearest_bbox is None: + # box[Y1_EXT_IDX] = boundry_y1 + # else: + # box[Y1_EXT_IDX] = bottom_nearest_bbox[Y0_IDX] - 1 + # box[X0_EXT_IDX] = box[X0_IDX] + # box[X1_EXT_IDX] = box[X1_IDX] + return bboxes + + +# =================================================================================================== + +def paper_bbox_sort_v2(all_bboxes: list, page_width:int, page_height:int): + """ + 增加预处理行为的排序: + return: + [ + { + "layout_bbox": [x0, y0, x1, y1], + "layout_label":"GOOD_LAYOUT/BAD_LAYOUT", + "content_bboxes": [] #每个元素都是[x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], 并且顺序就是阅读顺序 + } + ] + """ + sorted_layouts = [] # 最后的返回结果 + page_x0, page_y0, page_x1, page_y1 = 1, 1, page_width-1, page_height-1 + + all_bboxes = 
paper_bbox_sort(all_bboxes) # 大致拍下序 + # 首先在水平方向上扩展独占一行的bbox + for bbox in all_bboxes: + left_nearest_bbox = find_left_nearest_bbox_direct(bbox, all_bboxes) # 非扩展线 + right_nearest_bbox = find_right_nearst_bbox_direct(bbox, all_bboxes) + if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行 + bbox[X0_EXT_IDX] = page_x0 + bbox[Y0_EXT_IDX] = bbox[Y0_IDX] + bbox[X1_EXT_IDX] = page_x1 + bbox[Y1_EXT_IDX] = bbox[Y1_IDX] + + # 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group + if len(all_bboxes)==1: + return [{"layout_bbox": [page_x0, page_y0, page_x1, page_y1], "layout_label":"GOOD_LAYOUT", "content_bboxes": all_bboxes}] + if len(all_bboxes)==0: + return [] + + """ + 然后合并所有连续水平方向的bbox. + + """ + all_bboxes.sort(key=lambda x: x[Y0_IDX]) + h_bboxes = [] + h_bbox_group = [] + v_boxes = [] + + for bbox in all_bboxes: + if bbox[X0_IDX] == page_x0 and bbox[X1_IDX] == page_x1: + h_bbox_group.append(bbox) + else: + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + h_bbox_group = [] + # 最后一个group + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + + """ + 现在h_bboxes里面是所有的group了,每个group都是一个list + 对h_bboxes里的每个group进行计算放回到sorted_layouts里 + """ + for gp in h_bboxes: + gp.sort(key=lambda x: x[Y0_IDX]) + block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp} + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + block_info["layout_bbox"] = [x0, y0, x1, y1] + sorted_layouts.append(block_info) + + # 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分 + h_split_lines = [page_y0] + for gp in h_bboxes: + layout_bbox = gp['layout_bbox'] + y0, y1 = layout_bbox[1], layout_bbox[3] + h_split_lines.append(y0) + h_split_lines.append(y1) + h_split_lines.append(page_y1) + + unsplited_bboxes = [] + for i in range(0, len(h_split_lines), 2): + start_y0, start_y1 = h_split_lines[i:i+2] + # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块 + bboxes_in_block = [bbox for bbox in 
all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1] + unsplited_bboxes.append(bboxes_in_block) + # ================== 至此,水平方向的 已经切分排序完毕==================================== + """ + 接下来针对每个非水平的部分切分垂直方向的 + 此时,只剩下了无法被完全水平打通的bbox了。对这些box,优先进行垂直扩展,然后进行垂直切分. + 分3步: + 1. 先把能完全垂直打通的隔离出去当做一个layout + 2. 其余的先垂直切分 + 3. 垂直切分之后的部分再尝试水平切分 + 4. 剩下的不能被切分的各个部分当成一个layout + """ + # 对每部分进行垂直切分 + for bboxes_in_block in unsplited_bboxes: + # 首先对这个block的bbox进行垂直方向上的扩展 + boundry_x0, boundry_y0, boundry_x1, boundry_y1 = find_boundry_bboxes(bboxes_in_block) + # 进行垂直方向上的扩展 + extended_vertical_bboxes = extend_bbox_vertical(bboxes_in_block, boundry_x0, boundry_y0, boundry_x1, boundry_y1) + # 然后对这个block进行垂直方向上的切分 + extend_bbox_vertical.sort(key=lambda x: x[X0_IDX]) # x方向上从小到大,代表了从左到右读取 + v_boxes_group = [] + for bbox in extended_vertical_bboxes: + if bbox[Y0_IDX]==boundry_y0 and bbox[Y1_IDX]==boundry_y1: + v_boxes_group.append(bbox) + else: + if len(v_boxes_group)>0: + v_boxes.append(v_boxes_group) + v_boxes_group = [] + + if len(v_boxes_group)>0: + + v_boxes.append(v_boxes_group) + + # 把连续的垂直部分加入到sorted_layouts里。注意这个时候已经是连续的垂直部分了,因为上面已经做了 + for gp in v_boxes: + gp.sort(key=lambda x: x[X0_IDX]) + block_info = {"layout_label":"GOOD_LAYOUT", "content_bboxes": gp} + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + block_info["layout_bbox"] = [x0, y0, x1, y1] + sorted_layouts.append(block_info) + + # 在垂直方向上,划分子块,也就是用贯通的垂直线进行切分。这些被切分出来的块,极大可能是可被垂直切分的,如果不能完全的垂直切分,那么尝试水平切分。都不能的则当成一个layout + v_split_lines = [boundry_x0] + for gp in v_boxes: + layout_bbox = gp['layout_bbox'] + x0, x1 = layout_bbox[0], layout_bbox[2] + v_split_lines.append(x0) + v_split_lines.append(x1) + v_split_lines.append(boundry_x1) + + reset_idx_x_y(all_bboxes) + all_boxes = _paper_bbox_sort_ext(all_bboxes) + return all_boxes + + + + + + + + diff --git a/magic_pdf/layout/layout_det_utils.py 
b/magic_pdf/layout/layout_det_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8b2b36cc071c9726bdb6ca03a3e5b98ded4eeb24 --- /dev/null +++ b/magic_pdf/layout/layout_det_utils.py @@ -0,0 +1,182 @@ +from magic_pdf.layout.bbox_sort import X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX +from magic_pdf.libs.boxbase import _is_bottom_full_overlap, _left_intersect, _right_intersect + + +def find_all_left_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 在all_bboxes里找到所有右侧垂直方向上和this_bbox有重叠的bbox, 不用延长线 + 并且要考虑两个box左右相交的情况,如果相交了,那么右侧的box就不算最左侧。 + """ + left_boxes = [box for box in all_bboxes if box[X1_IDX] <= this_bbox[X0_IDX] + and any([ + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _left_intersect(box[:4], this_bbox[:4])] + + # 然后再过滤一下,找到水平上距离this_bbox最近的那个——x1最大的那个 + if len(left_boxes) > 0: + left_boxes.sort(key=lambda x: x[X1_EXT_IDX] if x[X1_EXT_IDX] else x[X1_IDX], reverse=True) + left_boxes = left_boxes[0] + else: + left_boxes = None + return left_boxes + +def find_all_right_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox右侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + right_bboxes = [box for box in all_bboxes if box[X0_IDX] >= this_bbox[X1_IDX] + and any([ + this_bbox[Y0_IDX] < box[Y0_IDX] < this_bbox[Y1_IDX], this_bbox[Y0_IDX] < box[Y1_IDX] < this_bbox[Y1_IDX], + box[Y0_IDX] < this_bbox[Y0_IDX] < box[Y1_IDX], box[Y0_IDX] < this_bbox[Y1_IDX] < box[Y1_IDX], + box[Y0_IDX]==this_bbox[Y0_IDX] and box[Y1_IDX]==this_bbox[Y1_IDX]]) or _right_intersect(this_bbox[:4], box[:4])] + + if len(right_bboxes)>0: + right_bboxes.sort(key=lambda x: x[X0_EXT_IDX] if x[X0_EXT_IDX] else x[X0_IDX]) + right_bboxes = right_bboxes[0] + else: + right_bboxes = None + return right_bboxes + +def 
find_all_top_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + top_bboxes.sort(key=lambda x: x[Y1_EXT_IDX] if x[Y1_EXT_IDX] else x[Y1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +def find_all_bottom_bbox_direct(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +# =================================================================================================================== +def find_bottom_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + # y0最小, 
X1最大的那个,也就是box上边缘最靠近this_bbox的那个,并且还最靠右 + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]] + # 然后再y1相同的情况下,找到x1最大的那个 + bottom_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_bottom_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox下侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + bottom_bboxes = [box for box in all_bboxes if box[Y0_IDX] >= this_bbox[Y1_IDX] and any([ + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(bottom_bboxes)>0: + # y0最小, X0最小的那个 + bottom_bboxes.sort(key=lambda x: x[Y0_IDX]) + bottom_bboxes = [box for box in bottom_bboxes if box[Y0_IDX]==bottom_bboxes[0][Y0_IDX]] + # 然后再y0相同的情况下,找到x0最小的那个 + bottom_bboxes.sort(key=lambda x: x[X0_IDX]) + bottom_bboxes = bottom_bboxes[0] + else: + bottom_bboxes = None + return bottom_bboxes + +def find_top_bbox_direct_from_left_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + # y1最大, X0最小的那个 + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]] + # 然后再y1相同的情况下,找到x0最小的那个 + top_bboxes.sort(key=lambda x: x[X0_IDX]) + top_bboxes = top_bboxes[0] + else: + top_bboxes 
= None + return top_bboxes + +def find_top_bbox_direct_from_right_edge(this_bbox, all_bboxes) -> list: + """ + 找到在this_bbox上侧且距离this_bbox距离最近的bbox.必须是直接遮挡的那种 + """ + top_bboxes = [box for box in all_bboxes if box[Y1_IDX] <= this_bbox[Y0_IDX] and any([ + box[X0_IDX] < this_bbox[X0_IDX] < box[X1_IDX], box[X0_IDX] < this_bbox[X1_IDX] < box[X1_IDX], + this_bbox[X0_IDX] < box[X0_IDX] < this_bbox[X1_IDX], this_bbox[X0_IDX] < box[X1_IDX] < this_bbox[X1_IDX], + box[X0_IDX]==this_bbox[X0_IDX] and box[X1_IDX]==this_bbox[X1_IDX]])] + + if len(top_bboxes)>0: + # y1最大, X1最大的那个 + top_bboxes.sort(key=lambda x: x[Y1_IDX], reverse=True) + top_bboxes = [box for box in top_bboxes if box[Y1_IDX]==top_bboxes[0][Y1_IDX]] + # 然后再y1相同的情况下,找到x1最大的那个 + top_bboxes.sort(key=lambda x: x[X1_IDX], reverse=True) + top_bboxes = top_bboxes[0] + else: + top_bboxes = None + return top_bboxes + +# =================================================================================================================== + +def get_left_edge_bboxes(all_bboxes) -> list: + """ + 返回最左边的bbox + """ + left_bboxes = [box for box in all_bboxes if find_all_left_bbox_direct(box, all_bboxes) is None] + return left_bboxes + +def get_right_edge_bboxes(all_bboxes) -> list: + """ + 返回最右边的bbox + """ + right_bboxes = [box for box in all_bboxes if find_all_right_bbox_direct(box, all_bboxes) is None] + return right_bboxes + +def fix_vertical_bbox_pos(bboxes:list): + """ + 检查这批bbox在垂直方向是否有轻微的重叠,如果重叠了,就把重叠的bbox往下移动一点 + 在x方向上必须一个包含或者被包含,或者完全重叠,不能只有部分重叠 + """ + bboxes.sort(key=lambda x: x[Y0_IDX]) # 从上向下排列 + for i in range(0, len(bboxes)): + for j in range(i+1, len(bboxes)): + if _is_bottom_full_overlap(bboxes[i][:4], bboxes[j][:4]): + # 如果两个bbox有部分重叠,那么就把下面的bbox往下移动一点 + bboxes[j][Y0_IDX] = bboxes[i][Y1_IDX] + 2 # 2是个经验值 + break + return bboxes diff --git a/magic_pdf/layout/layout_sort.py b/magic_pdf/layout/layout_sort.py new file mode 100644 index 0000000000000000000000000000000000000000..6b387d4c97e318aaa831b92a5238a5f9d67e7d11 --- 
/dev/null +++ b/magic_pdf/layout/layout_sort.py @@ -0,0 +1,732 @@ +""" +对pdf上的box进行layout识别,并对内部组成的box进行排序 +""" + +from loguru import logger +from magic_pdf.layout.bbox_sort import CONTENT_IDX, CONTENT_TYPE_IDX, X0_EXT_IDX, X0_IDX, X1_EXT_IDX, X1_IDX, Y0_EXT_IDX, Y0_IDX, Y1_EXT_IDX, Y1_IDX, paper_bbox_sort +from magic_pdf.layout.layout_det_utils import find_all_left_bbox_direct, find_all_right_bbox_direct, find_bottom_bbox_direct_from_left_edge, find_bottom_bbox_direct_from_right_edge, find_top_bbox_direct_from_left_edge, find_top_bbox_direct_from_right_edge, find_all_top_bbox_direct, find_all_bottom_bbox_direct, get_left_edge_bboxes, get_right_edge_bboxes +from magic_pdf.libs.boxbase import get_bbox_in_boundry + + +LAYOUT_V = "V" +LAYOUT_H = "H" +LAYOUT_UNPROC = "U" +LAYOUT_BAD = "B" + +def _is_single_line_text(bbox): + """ + 检查bbox里面的文字是否只有一行 + """ + return True # TODO + box_type = bbox[CONTENT_TYPE_IDX] + if box_type != 'text': + return False + paras = bbox[CONTENT_IDX]["paras"] + text_content = "" + for para_id, para in paras.items(): # 拼装内部的段落文本 + is_title = para['is_title'] + if is_title!=0: + text_content += f"## {para['text']}" + else: + text_content += para["text"] + text_content += "\n\n" + + return bbox[CONTENT_TYPE_IDX] == 'text' and len(text_content.split("\n\n")) <= 1 + + +def _horizontal_split(bboxes:list, boundry:tuple, avg_font_size=20)-> list: + """ + 对bboxes进行水平切割 + 方法是:找到左侧和右侧都没有被直接遮挡的box,然后进行扩展,之后进行切割 + return: + 返回几个大的Layout区域 [[x0, y0, x1, y1, "h|u|v"], ], h代表水平,u代表未探测的,v代表垂直布局 + """ + sorted_layout_blocks = [] # 这是要最终返回的值 + + bound_x0, bound_y0, bound_x1, bound_y1 = boundry + all_bboxes = get_bbox_in_boundry(bboxes, boundry) + #all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。 + """ + 首先在水平方向上扩展独占一行的bbox + + """ + last_h_split_line_y1 = bound_y0 #记录下上次的水平分割线 + for i, bbox in enumerate(all_bboxes): + left_nearest_bbox = find_all_left_bbox_direct(bbox, all_bboxes) # 非扩展线 + 
right_nearest_bbox = find_all_right_bbox_direct(bbox, all_bboxes) + if left_nearest_bbox is None and right_nearest_bbox is None: # 独占一行 + """ + 然而,如果只是孤立的一行文字,那么就还要满足以下几个条件才可以: + 1. bbox和中心线相交。或者 + 2. 上方或者下方也存在同类水平的独占一行的bbox。 或者 + 3. TODO 加强条件:这个bbox上方和下方是同一列column,那么就不能算作独占一行 + """ + # 先检查这个bbox里是否只包含一行文字 + is_single_line = _is_single_line_text(bbox) + """ + 这里有个点需要注意,当页面内容不是居中的时候,第一次调用传递的是page的boundry,这个时候mid_x就不是中心线了. + 所以这里计算出最紧致的boundry,然后再计算mid_x + """ + boundry_real_x0, boundry_real_x1 = min([bbox[X0_IDX] for bbox in all_bboxes]), max([bbox[X1_IDX] for bbox in all_bboxes]) + mid_x = (boundry_real_x0+boundry_real_x1)/2 + # 检查这个box是否内容在中心线有交 + # 必须跨过去2个字符的宽度 + is_cross_boundry_mid_line = min(mid_x-bbox[X0_IDX], bbox[X1_IDX]-mid_x) > avg_font_size*2 + """ + 检查条件2 + """ + is_belong_to_col = False + """ + 检查是否能被上方col吸收,方法是: + 1. 上方非空且不是独占一行的,并且 + 2. 从上个水平分割的最大y=y1开始到当前bbox,最左侧的bbox的[min_x0, max_x1],能够覆盖当前box的[x0, x1] + """ + """ + 以迭代的方式向上找,查找范围是[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]] + """ + #先确定上方的y0, y0 + b_y0, b_y1 = last_h_split_line_y1, bbox[Y0_IDX] + #然后从box开始逐个向上找到所有与box在x上有交集的box + box_to_check = [bound_x0, b_y0, bound_x1, b_y1] + bbox_in_bound_check = get_bbox_in_boundry(all_bboxes, box_to_check) + + bboxes_on_top = [] + virtual_box = bbox + while True: + b_on_top = find_all_top_bbox_direct(virtual_box, bbox_in_bound_check) + if b_on_top is not None: + bboxes_on_top.append(b_on_top) + virtual_box = [min([virtual_box[X0_IDX], b_on_top[X0_IDX]]), min(virtual_box[Y0_IDX], b_on_top[Y0_IDX]), max([virtual_box[X1_IDX], b_on_top[X1_IDX]]), b_y1] + else: + break + + # 随后确定这些box的最小x0, 最大x1 + if len(bboxes_on_top)>0 and len(bboxes_on_top) != len(bbox_in_bound_check):# virtual_box可能会膨胀到占满整个区域,这实际上就不能属于一个col了。 + min_x0, max_x1 = virtual_box[X0_IDX], virtual_box[X1_IDX] + # 然后采用一种比较粗糙的方法,看min_x0,max_x1是否与位于[bound_x0, last_h_sp, bound_x1, bbox[Y0_IDX]]之间的box有相交 + + if not any([b[X0_IDX] <= min_x0-1 <= b[X1_IDX] or b[X0_IDX] <= max_x1+1 <= b[X1_IDX] for b in 
bbox_in_bound_check]): + # 其上,下都不能被扩展成行,暂时只检查一下上方 TODO + top_nearest_bbox = find_all_top_bbox_direct(bbox, bboxes) + bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, bboxes) + if not any([ + top_nearest_bbox is not None and (find_all_left_bbox_direct(top_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(top_nearest_bbox, bboxes) is None), + bottom_nearest_bbox is not None and (find_all_left_bbox_direct(bottom_nearest_bbox, bboxes) is None and find_all_right_bbox_direct(bottom_nearest_bbox, bboxes) is None), + top_nearest_bbox is None or bottom_nearest_bbox is None + ]): + is_belong_to_col = True + + # 检查是否能被下方col吸收 TODO + + """ + 这里为什么没有is_cross_boundry_mid_line的条件呢? + 确实有些杂志左右两栏宽度不是对称的。 + """ + if not is_belong_to_col or is_cross_boundry_mid_line: + bbox[X0_EXT_IDX] = bound_x0 + bbox[Y0_EXT_IDX] = bbox[Y0_IDX] + bbox[X1_EXT_IDX] = bound_x1 + bbox[Y1_EXT_IDX] = bbox[Y1_IDX] + last_h_split_line_y1 = bbox[Y1_IDX] # 更新这条线 + else: + continue + """ + 此时独占一行的被成功扩展到指定的边界上,这个时候利用边界条件合并连续的bbox,成为一个group + 然后合并所有连续水平方向的bbox. 
+ """ + all_bboxes.sort(key=lambda x: x[Y0_IDX]) + h_bboxes = [] + h_bbox_group = [] + + for bbox in all_bboxes: + if bbox[X0_EXT_IDX] == bound_x0 and bbox[X1_EXT_IDX] == bound_x1: + h_bbox_group.append(bbox) + else: + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + h_bbox_group = [] + # 最后一个group + if len(h_bbox_group)>0: + h_bboxes.append(h_bbox_group) + + """ + 现在h_bboxes里面是所有的group了,每个group都是一个list + 对h_bboxes里的每个group进行计算放回到sorted_layouts里 + """ + h_layouts = [] + for gp in h_bboxes: + gp.sort(key=lambda x: x[Y0_IDX]) + # 然后计算这个group的layout_bbox,也就是最小的x0,y0, 最大的x1,y1 + x0, y0, x1, y1 = gp[0][X0_EXT_IDX], gp[0][Y0_EXT_IDX], gp[-1][X1_EXT_IDX], gp[-1][Y1_EXT_IDX] + h_layouts.append([x0, y0, x1, y1, LAYOUT_H]) # 水平的布局 + + """ + 接下来利用这些连续的水平bbox的layout_bbox的y0, y1,从水平上切分开其余的为几个部分 + """ + h_split_lines = [bound_y0] + for gp in h_bboxes: # gp是一个list[bbox_list] + y0, y1 = gp[0][1], gp[-1][3] + h_split_lines.append(y0) + h_split_lines.append(y1) + h_split_lines.append(bound_y1) + + unsplited_bboxes = [] + for i in range(0, len(h_split_lines), 2): + start_y0, start_y1 = h_split_lines[i:i+2] + # 然后找出[start_y0, start_y1]之间的其他bbox,这些组成一个未分割板块 + bboxes_in_block = [bbox for bbox in all_bboxes if bbox[Y0_IDX]>=start_y0 and bbox[Y1_IDX]<=start_y1] + unsplited_bboxes.append(bboxes_in_block) + # 接着把未处理的加入到h_layouts里 + for bboxes_in_block in unsplited_bboxes: + if len(bboxes_in_block) == 0: + continue + x0, y0, x1, y1 = bound_x0, min([bbox[Y0_IDX] for bbox in bboxes_in_block]), bound_x1, max([bbox[Y1_IDX] for bbox in bboxes_in_block]) + h_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC]) + + h_layouts.sort(key=lambda x: x[1]) # 按照y0排序, 也就是从上到下的顺序 + + """ + 转换成如下格式返回 + """ + for layout in h_layouts: + sorted_layout_blocks.append({ + "layout_bbox": layout[:4], + "layout_label":layout[4], + "sub_layout":[], + }) + return sorted_layout_blocks + +############################################################################################### +# +# 垂直方向的处理 +# +# 
def _vertical_align_split_v1(bboxes:list, boundry:tuple)-> list:
    """
    Compute vertical alignment and split *bboxes* into column layouts.
    Handles splitting a "one column, many rows" region along the column axis.
    If the region cannot be split completely, the remainder is returned as a
    layout whose layout_label is "u" (LAYOUT_UNPROC).
    -----------------------
    |           |    |
    |           |    |
    |           |    |
    |           |    |
    -------------------------
    The layout above will be split into 2 columns.
    """
    sorted_layout_blocks = [] # the final return value
    new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]

    v_blocks = []
    """
    First peel columns off from left to right.
    """
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        left_edge_bboxes = get_left_edge_bboxes(all_bboxes)
        if len(left_edge_bboxes) == 0:
            break
        right_split_line_x1 = max([bbox[X1_IDX] for bbox in left_edge_bboxes])+1
        # Check whether this split line crosses or touches any bbox's horizontal span.
        if any([bbox[X0_IDX] <= right_split_line_x1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # The vertical split line cuts through some boxes, so a complete
            # vertical split is impossible here.
            break
        else: # Successfully carved out one column.
            # Use the leftmost bbox on the left edge as the layout's x0
            layout_x0 = min([bbox[X0_IDX] for bbox in left_edge_bboxes]) # mainly so drawn layouts keep some visual spacing
            v_blocks.append([layout_x0, new_boundry[1], right_split_line_x1, new_boundry[3], LAYOUT_V])
            new_boundry[0] = right_split_line_x1 # shrink the boundary from the left

    """
    Then cut from right to left.  If the remainder still cannot be split
    completely, return it as a layout whose layout_label is "u".
    """
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        right_edge_bboxes = get_right_edge_bboxes(all_bboxes)
        if len(right_edge_bboxes) == 0:
            break
        left_split_line_x0 = min([bbox[X0_IDX] for bbox in right_edge_bboxes])-1
        # Check whether this split line crosses or touches any bbox's horizontal span.
        if any([bbox[X0_IDX] <= left_split_line_x0 <= bbox[X1_IDX] for bbox in all_bboxes]):
            # This is the unsplittable remainder.
            unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
            break
        else:
            # Use the rightmost bbox on the right edge as the layout's x1
            layout_x1 = max([bbox[X1_IDX] for bbox in right_edge_bboxes])
            v_blocks.append([left_split_line_x0, new_boundry[1], layout_x1, new_boundry[3], LAYOUT_V])
            new_boundry[2] = left_split_line_x0 # shrink the boundary from the right

    """
    Finally assemble everything into the layout dict format and return.
    """
    for block in v_blocks:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })
    for block in unsplited_block:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })

    # Sort by x0, i.e. left-to-right reading order.
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _vertical_align_split_v2(bboxes:list, boundry:tuple)-> list:
    """
    Improved version of _vertical_align_split_v1.  The old algorithm could
    treat second-column boxes as part of the first column whenever their left
    side was unobstructed, collapsing a multi-column layout into one column.
    This version starts from the top-left box and scans downwards, growing the
    window [w_x0, w_x1] until it can no longer be extended or the bottom
    boundary is reached.
    """
    sorted_layout_blocks = [] # the final return value
    new_boundry = [boundry[0], boundry[1], boundry[2], boundry[3]]
    bad_boxes = [] # boxes that a split line cut through
    v_blocks = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        if len(all_bboxes) == 0:
            break
        left_top_box = min(all_bboxes, key=lambda x: (x[X0_IDX],x[Y0_IDX]))# should be hardened: verify it really sits in the first column. TODO
        start_box = [left_top_box[X0_IDX], left_top_box[Y0_IDX], left_top_box[X1_IDX], left_top_box[Y1_IDX]]
        w_x0, w_x1 = left_top_box[X0_IDX], left_top_box[X1_IDX]
        """
        Walk down from this box to the nearest box below, widening [w_x0, w_x1]
        along the way.  After widening, the line x = w_x1 is used to test
        whether any box inside the boundary is crossed; if so, no further
        extension is possible.  Once extension stops, check whether the bottom
        boundary was reached:
        1. reached: update the left boundary and split off the next column;
        2. not reached: fall through to the right-to-left loop below.
        """
        while left_top_box is not None: # scan downwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_bottom_bbox_direct_from_left_edge(virtual_box, all_bboxes)
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])
        # In case the initial box sat in the middle of the column, also scan upwards.
        start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]] # widen first for robustness
        left_top_box = find_top_bbox_direct_from_left_edge(start_box, all_bboxes)
        while left_top_box is not None: # scan upwards
            virtual_box = [w_x0, left_top_box[Y0_IDX], w_x1, left_top_box[Y1_IDX]]
            left_top_box = find_top_bbox_direct_from_left_edge(virtual_box, all_bboxes)
            if left_top_box:
                w_x0, w_x1 = min(virtual_box[X0_IDX], left_top_box[X0_IDX]), max([virtual_box[X1_IDX], left_top_box[X1_IDX]])

        # Check whether the split line x = w_x1 + 1 cuts through any box.
        if any([bbox[X0_IDX] <= w_x1+1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            for b in all_bboxes:
                if b[X0_IDX] <= w_x1+1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else: # Successfully carved out one column.
            v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
            new_boundry[0] = w_x1 # shrink the boundary from the left

    """
    Now scan starting from the top-right box.
    """
    w_x0 , w_x1 = 0, 0
    unsplited_block = []
    while True:
        all_bboxes = get_bbox_in_boundry(bboxes, new_boundry)
        if len(all_bboxes) == 0:
            break
        # First find the box(es) with the largest X1.
        bbox_list_sorted = sorted(all_bboxes, key=lambda bbox: bbox[X1_IDX], reverse=True)
        # Then, find the boxes with the smallest Y0 value
        bigest_x1 = bbox_list_sorted[0][X1_IDX]
        boxes_with_bigest_x1 = [bbox for bbox in bbox_list_sorted if bbox[X1_IDX] == bigest_x1] # i.e. the rightmost ones
        right_top_box = min(boxes_with_bigest_x1, key=lambda bbox: bbox[Y0_IDX]) # the one with the smallest y0
        start_box = [right_top_box[X0_IDX], right_top_box[Y0_IDX], right_top_box[X1_IDX], right_top_box[Y1_IDX]]
        w_x0, w_x1 = right_top_box[X0_IDX], right_top_box[X1_IDX]

        while right_top_box is not None: # scan downwards along the right edge
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_bottom_bbox_direct_from_right_edge(virtual_box, all_bboxes)
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])
        # Then scan upwards.
        start_box = [w_x0, start_box[Y0_IDX], w_x1, start_box[Y1_IDX]] # widen first for robustness
        right_top_box = find_top_bbox_direct_from_right_edge(start_box, all_bboxes)
        while right_top_box is not None:
            virtual_box = [w_x0, right_top_box[Y0_IDX], w_x1, right_top_box[Y1_IDX]]
            right_top_box = find_top_bbox_direct_from_right_edge(virtual_box, all_bboxes)
            if right_top_box:
                w_x0, w_x1 = min([w_x0, right_top_box[X0_IDX]]), max([w_x1, right_top_box[X1_IDX]])

        # If the split line x = w_x0 - 1 cuts through any box, a complete
        # vertical split is impossible.
        if any([bbox[X0_IDX] <= w_x0-1 <= bbox[X1_IDX] for bbox in all_bboxes]):
            unsplited_block.append([new_boundry[0], new_boundry[1], new_boundry[2], new_boundry[3], LAYOUT_UNPROC])
            for b in all_bboxes:
                if b[X0_IDX] <= w_x0-1 <= b[X1_IDX]:
                    bad_boxes.append([b[X0_IDX], b[Y0_IDX], b[X1_IDX], b[Y1_IDX]])
            break
        else: # Successfully carved out one column.
            v_blocks.append([w_x0, new_boundry[1], w_x1, new_boundry[3], LAYOUT_V])
            new_boundry[2] = w_x0

    """Convert into the layout dict structure."""
    for block in v_blocks:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
        })

    for block in unsplited_block:
        sorted_layout_blocks.append({
            "layout_bbox": block[:4],
            "layout_label":block[4],
            "sub_layout":[],
            "bad_boxes": bad_boxes # record the boxes that a split line cut through
        })


    # Sort by x0 (left-to-right reading order).
    sorted_layout_blocks.sort(key=lambda x: x['layout_bbox'][0])
    return sorted_layout_blocks
def _try_horizontal_mult_column_split(bboxes:list, boundry:tuple)-> list:
    """
    Try a horizontal multi-column split; if nothing can be split, the region
    should be returned as a BAD_LAYOUT.
    ------------------
    |        |       |
    ------------------
    |    |   |   |   |   <- the scenario this function is meant to split
    ------------------
    |        |       |
    |        |       |
    """
    # NOTE(review): unimplemented stub — currently always returns None.
    pass
这个版本里,如果垂直切分不动,那就当一个BAD_LAYOUT返回 + + -------------------------- + | | | + | | | + | | + 这种列是此函数要切分的 -> | | + | | + | | | + | | | + ------------------------- + """ + sorted_layout_blocks = [] # 这是要最终返回的值 + + bound_x0, bound_y0, bound_x1, bound_y1 = boundry + all_bboxes = get_bbox_in_boundry(bboxes, boundry) + """ + all_bboxes = fix_vertical_bbox_pos(all_bboxes) # 垂直方向解覆盖 + all_bboxes = fix_hor_bbox_pos(all_bboxes) # 水平解覆盖 + + 这两行代码目前先不执行,因为公式检测,表格检测还不是很成熟,导致非常多的textblock参与了运算,时间消耗太大。 + 这两行代码的作用是: + 如果遇到互相重叠的bbox, 那么会把面积较小的box进行压缩,从而避免重叠。对布局切分来说带来正反馈。 + """ + + #all_bboxes = paper_bbox_sort(all_bboxes, abs(bound_x1-bound_x0), abs(bound_y1-bound_x0)) # 大致拍下序, 这个是基于直接遮挡的。 + """ + 首先在垂直方向上扩展独占一行的bbox + + """ + for bbox in all_bboxes: + top_nearest_bbox = find_all_top_bbox_direct(bbox, all_bboxes) # 非扩展线 + bottom_nearest_bbox = find_all_bottom_bbox_direct(bbox, all_bboxes) + if top_nearest_bbox is None and bottom_nearest_bbox is None and not any([b[X0_IDX]=start_x0 and bbox[X1_IDX]<=start_x1] + unsplited_bboxes.append(bboxes_in_block) + # 接着把未处理的加入到v_layouts里 + for bboxes_in_block in unsplited_bboxes: + if len(bboxes_in_block) == 0: + continue + x0, y0, x1, y1 = min([bbox[X0_IDX] for bbox in bboxes_in_block]), bound_y0, max([bbox[X1_IDX] for bbox in bboxes_in_block]), bound_y1 + v_layouts.append([x0, y0, x1, y1, LAYOUT_UNPROC]) # 说明这篇区域未能够分析出可靠的版面 + + v_layouts.sort(key=lambda x: x[0]) # 按照x0排序, 也就是从左到右的顺序 + + for layout in v_layouts: + sorted_layout_blocks.append({ + "layout_bbox": layout[:4], + "layout_label":layout[4], + "sub_layout":[], + }) + + """ + 至此,垂直方向切成了2种类型,其一是独占一列的,其二是未处理的。 + 下面对这些未处理的进行垂直方向切分,这个切分要切出来类似“吕”这种类型的垂直方向的布局 + """ + for i, layout in enumerate(sorted_layout_blocks): + if layout['layout_label'] == LAYOUT_UNPROC: + x0, y0, x1, y1 = layout['layout_bbox'] + v_split_layouts = _vertical_align_split_v2(bboxes, [x0, y0, x1, y1]) + sorted_layout_blocks[i] = { + "layout_bbox": [x0, y0, x1, y1], + "layout_label": LAYOUT_H, + "sub_layout": v_split_layouts + } 
def split_layout(bboxes:list, boundry:tuple, page_num:int)-> list:
    """
    Split *bboxes* into layouts.
    return:
        [
            {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label":"u|v|h|b", unprocessed|vertical|horizontal|BAD_LAYOUT
                "sub_layout": [] # each element is [x0, y0, x1, y1, block_content, idx_x, idx_y, content_type, ext_x0, ext_y0, ext_x1, ext_y1], already in reading order
            }
        ]
    example:
        [
            {
                "layout_bbox": [0, 0, 100, 100],
                "layout_label":"u|v|h|b",
                "sub_layout":[

                ]
            },
            {
                "layout_bbox": [0, 0, 100, 100],
                "layout_label":"u|v|h|b",
                "sub_layout":[
                    {
                        "layout_bbox": [0, 0, 100, 100],
                        "layout_label":"u|v|h|b",
                        "content_bboxes":[
                            [],
                            [],
                            []
                        ]
                    },
                    {
                        "layout_bbox": [0, 0, 100, 100],
                        "layout_label":"u|v|h|b",
                        "sub_layout":[

                        ]
                    }
            }
        ]
    """
    sorted_layouts = [] # the final return value

    boundry_x0, boundry_y0, boundry_x1, boundry_y1 = boundry
    if len(bboxes) <=1:
        return [
            {
                "layout_bbox": [boundry_x0, boundry_y0, boundry_x1, boundry_y1],
                "layout_label": LAYOUT_V,
                "sub_layout":[]
            }
        ]

    """
    Split horizontally first, then vertically.
    """
    bboxes = paper_bbox_sort(bboxes, boundry_x1-boundry_x0, boundry_y1-boundry_y0)
    sorted_layouts = _horizontal_split(bboxes, boundry) # layouts produced by the horizontal split
    for i, layout in enumerate(sorted_layouts):
        x0, y0, x1, y1 = layout['layout_bbox']
        layout_type = layout['layout_label']
        if layout_type == LAYOUT_UNPROC: # not an exclusive single-row region, needs a vertical split
            v_split_layouts = _vertical_split(bboxes, [x0, y0, x1, y1])

            """
            Edge case: if this call separated only ONE column layout, the split
            exceeded the algorithm's ability.  We assume the incoming boxes
            already had full rows stripped, so a proper split must yield several
            columns; a single layout made of multiple boxes means the region
            cannot be split, so mark it LAYOUT_UNPROC.
            """
            layout_label = LAYOUT_V
            if len(v_split_layouts) == 1:
                if len(v_split_layouts[0]['sub_layout']) == 0:
                    layout_label = LAYOUT_UNPROC
                    #logger.warning(f"WARNING: pageno={page_num}, 无法分割的layout: ", v_split_layouts)

            """
            Assemble the final layout.
            """
            sorted_layouts[i] = {
                "layout_bbox": [x0, y0, x1, y1],
                "layout_label": layout_label,
                "sub_layout": v_split_layouts
            }
            layout['layout_label'] = LAYOUT_H # NOTE(review): dead store — `layout` is the dict that was just replaced in sorted_layouts[i] on the line above; nothing reads it afterwards

    """
    Horizontal and vertical splitting are both done.  Some regions may remain
    unprocessed because neither direction could split them; those should get a
    final joint multi-block horizontal split via _try_horizontal_mult_block_split
    and otherwise be returned as BAD_LAYOUT.
    """
    # TODO

    return sorted_layouts
def get_bboxes_layout(all_boxes:list, boundry:tuple, page_id:int):
    """
    Split a page's boxes into a layout tree, then flatten its leaves into
    reading order.

    Args:
        all_boxes: the page's boxes (augmented lists, see split_layout).
        boundry:   (x0, y0, x1, y1) boundary of the page/region.
        page_id:   page number, forwarded to split_layout for diagnostics.

    Returns:
        tuple: (leaf layout blocks in reading order, the full layout tree).
        Each block is {"layout_bbox": [x0, y0, x1, y1],
                       "layout_label": "u|v|h|b",  # unprocessed|vertical|horizontal|BAD_LAYOUT
                       "sub_layout": [...]}.
    """
    def _preorder_traversal(layouts):
        """
        Collect the leaf nodes (len(sub_layout) == 0) of the layout tree in
        pre-order, i.e. top-to-bottom, left-to-right reading order.
        """
        leaves = []
        # Fix: the original wrote `for layout in layout`, shadowing the
        # parameter with the loop variable — confusing and linter-flagged.
        for node in layouts:
            sub_layout = node['sub_layout']
            if len(sub_layout) == 0:
                leaves.append(node)
            else:
                leaves.extend(_preorder_traversal(sub_layout))
        return leaves
    # -------------------------------------------------------------------------------------------------------------------------
    sorted_layouts = split_layout(all_boxes, boundry, page_id) # first split into layouts, producing a tree
    total_sorted_layout_blocks = _preorder_traversal(sorted_layouts)
    return total_sorted_layout_blocks, sorted_layouts
def sort_text_block(text_block, layout_bboxes):
    """
    Sort one page's text blocks into reading order.

    Blocks are grouped by the layout region that contains them (layout_bboxes
    is already in reading order); within one region they are ordered
    top-to-bottom by y0.
    """
    # Map bbox tuple -> original block dict so blocks can be recovered after
    # sorting plain coordinate lists.
    block_by_bbox = {}
    coord_boxes = []
    for block in text_block:
        bb = block['bbox']
        block_by_bbox[(bb[0], bb[1], bb[2], bb[3])] = block
        coord_boxes.append(bb)

    ordered_blocks = []
    for lay in layout_bboxes:
        lx0, ly0, lx1, ly1 = lay['layout_bbox']
        # Grow the layout box by 1pt on every side so boxes touching the
        # border are still captured.
        members = get_bbox_in_boundry(coord_boxes, [lx0 - 1, ly0 - 1, lx1 + 1, ly1 + 1])
        members.sort(key=lambda b: b[1])  # top-to-bottom inside one layout
        for bb in members:
            ordered_blocks.append(block_by_bbox[(bb[0], bb[1], bb[2], bb[3])])

    return ordered_blocks
def __rect_filter_by_width(rect, page_w, page_h):
    """
    Keep only rects that straddle the page's vertical center line.

    `page_h` is unused but kept for call-site symmetry with the other filters.
    """
    center_x = page_w / 2
    return rect[0] < center_x < rect[2]
def get_spilter_of_page(page, image_bboxes, table_bboxes):
    """
    Collect colored blocks and horizontal rules that can act as layout
    splitters on *page*.

    Args:
        page: a fitz (PyMuPDF) page object.
        image_bboxes: bboxes of images; splitters overlapping them are dropped.
        table_bboxes: bboxes of tables; splitters overlapping them are dropped.

    Returns:
        list[list[float]]: candidate splitter rects [x0, y0, x1, y1].
    """
    spilter_bbox = []
    for block in page.get_cdrawings():
        # Fix: the original assigned `fill = block['fill']` in a separate
        # dead `if 'fill' in block:` branch and never used it, then repeated
        # the membership test. dict.get collapses both checks.
        fill = block.get('fill')
        # Keep only visibly filled (non-white) drawings.
        if fill and fill != (1.0, 1.0, 1.0):
            rect = block['rect']
            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) \
                    and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
                spilter_bbox.append(list(rect))

    # Some rects come back with zero or negative height, which can send the
    # downstream layout split into an infinite loop; clamp them to height 1.
    for box in spilter_bbox:
        if box[3] - box[1] <= 0:
            box[3] = box[1] + 1

    # __debug_show_page(page, spilter_bbox, [], [])

    return spilter_bbox
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column.

    Returns a list of fitz.IRect text-column boxes, sorted ascending by y0,
    then x0 (see module docstring for usage and restrictions).
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        # NOTE(review): the intersects_bboxes(temp, vert_bboxes) test does not
        # depend on b, so it is loop-invariant — and it is never evaluated at
        # all when bboxlist is empty.
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b != None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        # NOTE(review): at i == 0 the comparison partner nblocks[i - 1] is
        # nblocks[-1], i.e. the last element (wrap-around).
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # re-compute the block bbox from its lines, skipping near-empty lines
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return of no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks
+ """ + + # get the file name + filename = sys.argv[1] + + # check if footer margin is given + if len(sys.argv) > 2: + footer_margin = int(sys.argv[2]) + else: # use default vaue + footer_margin = 50 + + # check if header margin is given + if len(sys.argv) > 3: + header_margin = int(sys.argv[3]) + else: # use default vaue + header_margin = 50 + + # open document + doc = fitz.open(filename) + + # iterate over the pages + for page in doc: + # remove any geometry issues + page.wrap_contents() + + # get the text bboxes + bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin) + + # prepare a canvas to draw rectangles and text + shape = page.new_shape() + + # iterate over the bboxes + for i, rect in enumerate(bboxes): + shape.draw_rect(rect) # draw a border + + # write sequence number + shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"]) + + # finish drawing / text with color red + shape.finish(color=fitz.pdfcolor["red"]) + shape.commit() # store to the page + + # save document with text bboxes + doc.ez_save(filename.replace(".pdf", "-blocks.pdf")) \ No newline at end of file diff --git a/magic_pdf/libs/Constants.py b/magic_pdf/libs/Constants.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0fa93282ec9b13cbfd72e013f5c618a96d243b --- /dev/null +++ b/magic_pdf/libs/Constants.py @@ -0,0 +1,11 @@ +""" +span维度自定义字段 +""" +# span是否是跨页合并的 +CROSS_PAGE = "cross_page" + +""" +block维度自定义字段 +""" +# block中lines是否被删除 +LINES_DELETED = "lines_deleted" \ No newline at end of file diff --git a/magic_pdf/libs/MakeContentConfig.py b/magic_pdf/libs/MakeContentConfig.py new file mode 100644 index 0000000000000000000000000000000000000000..b1650affcfd3514a5a1d317e243dab58785ef452 --- /dev/null +++ b/magic_pdf/libs/MakeContentConfig.py @@ -0,0 +1,10 @@ +class MakeMode: + MM_MD = "mm_markdown" + NLP_MD = "nlp_markdown" + STANDARD_FORMAT = "standard_format" + + +class DropMode: + WHOLE_PDF = "whole_pdf" + SINGLE_PAGE = 
def _is_in_or_part_overlap(box1, box2) -> bool:
    """
    Return True when the two boxes share any area — partial overlap or full
    containment either way.
    """
    if box1 is None or box2 is None:
        return False

    ax0, ay0, ax1, ay1 = box1
    bx0, by0, bx1, by1 = box2

    # Two axis-aligned rectangles intersect iff their projections on both
    # axes intersect.
    x_overlap = ax0 <= bx1 and bx0 <= ax1
    y_overlap = ay0 <= by1 and by0 <= ay1
    return x_overlap and y_overlap
""" + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + return (x0_1 >= x0_2 and # box1的左边界不在box2的左边外 + y0_1 >= y0_2 and # box1的上边界不在box2的上边外 + x1_1 <= x1_2 and # box1的右边界不在box2的右边外 + y1_1 <= y1_2) # box1的下边界不在box2的下边外 + +def _is_part_overlap(box1, box2) -> bool: + """ + 两个bbox是否有部分重叠,但不完全包含 + """ + if box1 is None or box2 is None: + return False + + return _is_in_or_part_overlap(box1, box2) and not _is_in(box1, box2) + +def _left_intersect(left_box, right_box): + "检查两个box的左边界是否有交集,也就是left_box的右边界是否在right_box的左边界内" + if left_box is None or right_box is None: + return False + + x0_1, y0_1, x1_1, y1_1 = left_box + x0_2, y0_2, x1_2, y1_2 = right_box + + return x1_1>x0_2 and x0_1x1_2 and (y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1) + + +def _is_vertical_full_overlap(box1, box2, x_torlence=2): + """ + x方向上:要么box1包含box2, 要么box2包含box1。不能部分包含 + y方向上:box1和box2有重叠 + """ + # 解析box的坐标 + x11, y11, x12, y12 = box1 # 左上角和右下角的坐标 (x1, y1, x2, y2) + x21, y21, x22, y22 = box2 + + # 在x轴方向上,box1是否包含box2 或 box2包含box1 + contains_in_x = (x11-x_torlence <= x21 and x12+x_torlence >= x22) or (x21-x_torlence <= x11 and x22+x_torlence >= x12) + + # 在y轴方向上,box1和box2是否有重叠 + overlap_in_y = not (y12 < y21 or y11 > y22) + + return contains_in_x and overlap_in_y + + +def _is_bottom_full_overlap(box1, box2, y_tolerance=2): + """ + 检查box1下方和box2的上方有轻微的重叠,轻微程度收到y_tolerance的限制 + 这个函数和_is_vertical-full_overlap的区别是,这个函数允许box1和box2在x方向上有轻微的重叠,允许一定的模糊度 + """ + if box1 is None or box2 is None: + return False + + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + tolerance_margin = 2 + is_xdir_full_overlap = ((x0_1-tolerance_margin<=x0_2<=x1_1+tolerance_margin and x0_1-tolerance_margin<=x1_2<=x1_1+tolerance_margin) or (x0_2-tolerance_margin<=x0_1<=x1_2+tolerance_margin and x0_2-tolerance_margin<=x1_1<=x1_2+tolerance_margin)) + + return y0_2= 0.5 or ratio_2 >= 0.5 + + #vertical_overlap_cond = y0_1<=y0_2<=y1_1 or y0_1<=y1_2<=y1_1 or y0_2<=y0_1<=y1_2 or y0_2<=y1_1<=y1_2 + return 
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
    """Check whether the two bboxes overlap on the y axis and the overlap
    exceeds `overlap_ratio_threshold` of the shorter bbox's height.
    """
    _, y0_1, _, y1_1 = bbox1
    _, y0_2, _, y1_2 = bbox2

    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
    if min_height <= 0:
        # Degenerate (zero/negative height) bbox: previously raised
        # ZeroDivisionError; there is no meaningful overlap ratio.
        return False

    return (overlap / min_height) > overlap_ratio_threshold


def calculate_iou(bbox1, bbox2):
    """Compute the intersection-over-union (IoU) of two bounding boxes.

    Args:
        bbox1 (list[float]): [x1, y1, x2, y2] with (x1, y1) the top-left and
            (x2, y2) the bottom-right corner.
        bbox2 (list[float]): same format as bbox1.

    Returns:
        float: IoU in [0, 1].
    """
    # Intersection rectangle.
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

    # union = area1 + area2 - intersection
    return intersection_area / float(bbox1_area + bbox2_area - intersection_area)
def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
    """Return the overlap area of the two bboxes as a fraction of the
    smaller bbox's area (0.0 when disjoint or when the smaller area is 0).
    """
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    min_box_area = min((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]),
                       (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]))
    if min_box_area == 0:
        return 0.0  # degenerate box: avoid division by zero
    return intersection_area / min_box_area


def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
    """Return the overlap area of bbox1 and bbox2 as a fraction of bbox1's area."""
    x_left = max(bbox1[0], bbox2[0])
    y_top = max(bbox1[1], bbox2[1])
    x_right = min(bbox1[2], bbox2[2])
    y_bottom = min(bbox1[3], bbox2[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if bbox1_area == 0:
        return 0.0  # was int 0; float for consistency with the other ratios
    return intersection_area / bbox1_area


def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
    """If the overlap/min-area ratio (see
    calculate_overlap_area_2_minbox_area_ratio) exceeds `ratio`, return the
    smaller of the two bboxes; otherwise return None.
    """
    x1_min, y1_min, x1_max, y1_max = bbox1
    x2_min, y2_min, x2_max, y2_max = bbox2
    area1 = (x1_max - x1_min) * (y1_max - y1_min)
    area2 = (x2_max - x2_min) * (y2_max - y2_min)

    if calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) > ratio:
        return bbox1 if area1 <= area2 else bbox2
    return None


def get_bbox_in_boundry(bboxes: list, boundry: tuple) -> list:
    """Return the bboxes fully contained in the `boundry` rectangle.

    NOTE(review): 'boundry' is a typo for 'boundary', but the name is public
    and must be kept for callers.
    """
    x0, y0, x1, y1 = boundry
    return [box for box in bboxes
            if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]


def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
    """Return True when the bbox hugs the left or right edge of a page of the
    given width (within `side_threshold` of the page width).

    NOTE(review): `height` is currently unused; kept for interface stability.
    """
    x0, x1 = bbox[0], bbox[2]
    return x1 <= width * side_threshold or x0 >= width * (1 - side_threshold)
def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Among the blocks below obj_bbox (2pt tolerance) that overlap it
    horizontally, return the nearest one (smallest y0), or None."""
    candidates = [
        blk for blk in pymu_blocks
        if blk['bbox'][1] - obj_bbox[3] >= -2 and not _is_in(blk['bbox'], obj_bbox)
    ]

    def _x_overlaps(blk):
        bx0, _, bx1, _ = blk['bbox']
        return (obj_bbox[0] - 2 <= bx0 <= obj_bbox[2] + 2
                or obj_bbox[0] - 2 <= bx1 <= obj_bbox[2] + 2
                or bx0 - 2 <= obj_bbox[0] <= bx1 + 2
                or bx0 - 2 <= obj_bbox[2] <= bx1 + 2)

    candidates = [blk for blk in candidates if _x_overlaps(blk)]
    if not candidates:
        return None
    # Nearest below == smallest top edge (y0); min keeps the first tie,
    # matching the original stable sort.
    return min(candidates, key=lambda blk: blk['bbox'][1])


def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
    """Among the blocks left of obj_bbox (2pt tolerance) that overlap it
    vertically, return the nearest one (largest x1), or None."""
    candidates = [
        blk for blk in pymu_blocks
        if obj_bbox[0] - blk['bbox'][2] >= -2 and not _is_in(blk['bbox'], obj_bbox)
    ]

    def _y_overlaps(blk):
        _, by0, _, by1 = blk['bbox']
        return (obj_bbox[1] - 2 <= by0 <= obj_bbox[3] + 2
                or obj_bbox[1] - 2 <= by1 <= obj_bbox[3] + 2
                or by0 - 2 <= obj_bbox[1] <= by1 + 2
                or by0 - 2 <= obj_bbox[3] <= by1 + 2)

    candidates = [blk for blk in candidates if _y_overlaps(blk)]
    if not candidates:
        return None
    # Nearest on the left == largest right edge (x1).
    return max(candidates, key=lambda blk: blk['bbox'][2])
def bbox_relative_pos(bbox1, bbox2):
    """Relative position of bbox1 with respect to bbox2.

    Args:
        bbox1: (x1, y1, x1b, y1b) — top-left and bottom-right corners.
        bbox2: (x2, y2, x2b, y2b) — top-left and bottom-right corners.

    Returns:
        (left, right, bottom, top): whether bbox1 lies strictly to the
        left of / right of / below / above bbox2.
    """
    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    return (x2b < x1, x1b < x2, y2b < y1, y1b < y2)


def bbox_distance(bbox1, bbox2):
    """Shortest distance between two axis-aligned rectangles.

    Args:
        bbox1, bbox2: (x1, y1, x2, y2) tuples.

    Returns:
        float: corner-to-corner distance when the boxes are diagonal to each
        other, the axis gap when they only differ on one axis, and 0 when
        they intersect.
    """
    def _point_dist(ax, ay, bx, by):
        return math.sqrt((ax - bx) ** 2 + (ay - by) ** 2)

    x1, y1, x1b, y1b = bbox1
    x2, y2, x2b, y2b = bbox2
    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)

    # Diagonal cases: closest points are corners.
    if top and left:
        return _point_dist(x1, y1b, x2b, y2)
    if left and bottom:
        return _point_dist(x1, y1, x2b, y2b)
    if bottom and right:
        return _point_dist(x1b, y1, x2, y2b)
    if right and top:
        return _point_dist(x1b, y1b, x2, y2)
    # Single-axis cases: distance is the gap on that axis.
    if left:
        return x1 - x2b
    if right:
        return x2 - x1b
    if bottom:
        return y1 - y2b
    if top:
        return y2 - y1b
    return 0  # rectangles intersect
import os
import csv
import json
import pandas as pd
from pandas import DataFrame as df
from matplotlib import pyplot as plt
from termcolor import cprint

"""
Execute this script in the following way:

1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:

    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json

2. Under the directory code-clean, execute the following command:

    $ python -m libs.calc_span_stats

"""


def print_green_on_red(text):
    """Print bold green text on a red background, followed by a blank line."""
    cprint(text, "green", "on_red", attrs=["bold"], end="\n\n")


def print_green(text):
    """Print bold green text, padded with blank lines."""
    print()
    cprint(text, "green", attrs=["bold"], end="\n\n")


def print_red(text):
    """Print bold red text, padded with blank lines."""
    print()
    cprint(text, "red", attrs=["bold"], end="\n\n")


def safe_get(dict_obj, key, default):
    """dict lookup that falls back to `default` when the key is missing
    or its value is None."""
    value = dict_obj.get(key)
    return default if value is None else value
span_font_flags_decoded = safe_get(span, "decomposed_flags", {}) + span_is_super_script = safe_get(span_font_flags_decoded, "is_superscript", False) + span_is_italic = safe_get(span_font_flags_decoded, "is_italic", False) + span_is_serifed = safe_get(span_font_flags_decoded, "is_serifed", False) + span_is_sans_serifed = safe_get(span_font_flags_decoded, "is_sans_serifed", False) + span_is_monospaced = safe_get(span_font_flags_decoded, "is_monospaced", False) + span_is_proportional = safe_get(span_font_flags_decoded, "is_proportional", False) + span_is_bold = safe_get(span_font_flags_decoded, "is_bold", False) + + span_stats.append( + { + "span_id": span_id, # id of span + "page_id": page_id, # page number of pdf + "span_text": span_text, # text of span + "span_font_name": span_font_name, # font name of span + "span_font_size": span_font_size, # font size of span + "span_font_color": span_font_color, # font color of span + "span_font_flags": span_font_flags, # font flags of span + "span_is_superscript": int( + span_is_super_script + ), # indicate whether the span is super script or not + "span_is_italic": int(span_is_italic), # indicate whether the span is italic or not + "span_is_serifed": int(span_is_serifed), # indicate whether the span is serifed or not + "span_is_sans_serifed": int( + span_is_sans_serifed + ), # indicate whether the span is sans serifed or not + "span_is_monospaced": int( + span_is_monospaced + ), # indicate whether the span is monospaced or not + "span_is_proportional": int( + span_is_proportional + ), # indicate whether the span is proportional or not + "span_is_bold": int(span_is_bold), # indicate whether the span is bold or not + } + ) + + span_id += 1 + + span_stats = pd.DataFrame(span_stats) + # print(span_stats) + + return span_stats + + +def __find_pdf_dic_files( + jf_name="pdf_dic.json", + base_code_name="code-clean", + tgt_base_dir_name="tmp", + unittest_dir_name="unittest", + md_dir_name="md", + book_names=[ + "scihub", + ], # other 
def combine_span_texts(group_df, span_stats):
    """For every span in `group_df`, build a three-line snippet showing the
    previous / current / next span text (looked up by index in `span_stats`),
    each line prefixed with an arrow marker; snippets are joined with blank
    lines."""
    # pointer_sign is a right arrow if the span is superscript, otherwise it is a down arrow
    pointer_sign = "→ → → "

    snippets = []
    for _, row in group_df.iterrows():
        idx = row.name
        prev_text = span_stats.at[idx - 1, "span_text"] if (idx - 1) in span_stats.index else ""
        next_text = span_stats.at[idx + 1, "span_text"] if (idx + 1) in span_stats.index else ""
        lines = (prev_text, row["span_text"], next_text)
        snippets.append("\n".join(pointer_sign + text for text in lines))

    return "\n\n".join(snippets)


# pd.set_option("display.max_colwidth", None)  # set to None to show the full text
pd.set_option("display.max_rows", None)  # set to None to show more rows
save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv") + raw_df.to_csv(save_path, index=False) + + filtered_df = raw_df[raw_df["span_is_superscript"] == 1] + if filtered_df.empty: + print("No superscript span found!") + continue + + filtered_grouped_df = filtered_df.groupby(["span_font_name", "span_font_size", "span_font_color"]) + + combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df) # type: ignore + + final_df = filtered_grouped_df.size().reset_index(name="count") + final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True) + + print(final_df) + + final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n")) + + save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv") + # 使用 UTF-8 编码并添加 BOM,确保所有字段被双引号包围 + final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL) + + # 创建一个 2x2 的图表布局 + fig, axs = plt.subplots(2, 2, figsize=(15, 10)) + + # 按照 span_font_name 分类作图 + final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name") + + # 按照 span_font_size 分类作图 + final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size") + + # 按照 span_font_color 分类作图 + final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color") + + # 按照 span_font_name、span_font_size 和 span_font_color 共同分类作图 + grouped = final_df.groupby(["span_font_name", "span_font_size", "span_font_color"]) + grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping") + + # 调整布局 + plt.tight_layout() + + # 显示图表 + # plt.show() + + # 保存图表到 PNG 文件 + save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png") + plt.savefig(save_path) + + # 清除画布 + plt.clf() + + +if __name__ == "__main__": + main() diff --git a/magic_pdf/libs/commons.py b/magic_pdf/libs/commons.py new file mode 100644 index 
import datetime
import json
import os, re, configparser
import subprocess
import time

import boto3
from loguru import logger
from boto3.s3.transfer import TransferConfig
from botocore.config import Config

import fitz  # switched to the rebased implementation in 1.23.9
# import fitz_old as fitz  # pymupdf before 1.23.9


def get_delta_time(input_time):
    """Seconds elapsed since `input_time` (a time.time() value), rounded to 2 decimals."""
    return round(time.time() - input_time, 2)


def join_path(*args):
    """Join path segments with '/', trimming trailing slashes from each segment."""
    return '/'.join(str(s).rstrip('/') for s in args)


# Global error-log path so demos can reference the same location.
error_log_path = "s3://llm-pdf-text/err_logs/"
# json_dump_path = "s3://pdf_books_temp/json_dump/"  # local-test path only; must not be committed to main
json_dump_path = "s3://llm-pdf-text/json_dump/"

# s3_image_save_path = "s3://mllm-raw-media/pdf2md_img/"  # business-specific path; does not belong in the base library


def get_top_percent_list(num_list, percent):
    """Return the largest `percent` fraction of `num_list`, sorted descending.

    :param num_list: list of numbers
    :param percent: fraction in [0, 1]
    :return: list with the top elements, descending
    """
    if not num_list:  # truthiness instead of len() == 0
        return []
    sorted_desc = sorted(num_list, reverse=True)
    cutoff = int(len(sorted_desc) * percent)
    return sorted_desc[:cutoff]


def formatted_time(time_stamp):
    """Format a unix timestamp as 'YYYY-MM-DD-HH:MM:SS' (local timezone)."""
    dt_object = datetime.datetime.fromtimestamp(time_stamp)
    return dt_object.strftime("%Y-%m-%d-%H:%M:%S")


def mymax(alist: list):
    """max() that returns 0 for an empty list instead of raising ValueError."""
    return max(alist, default=0)
"aws_access_key_id") + sk = config.get(profile, "aws_secret_access_key") + if profile == "default": + s3_str = config.get(f"{profile}", "s3") + else: + s3_str = config.get(f"profile {profile}", "s3") + end_match = re.search("endpoint_url[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE) + if end_match: + endpoint = end_match.group(1) + else: + raise ValueError(f"aws 配置文件中没有找到 endpoint_url") + style_match = re.search("addressing_style[\s]*=[\s]*([^\s\n]+)[\s\n]*$", s3_str, re.MULTILINE) + if style_match: + addressing_style = style_match.group(1) + else: + addressing_style = "path" + elif isinstance(profile, dict): + ak = profile["ak"] + sk = profile["sk"] + endpoint = profile["endpoint"] + addressing_style = "auto" + + return ak, sk, endpoint, addressing_style + + +def parse_bucket_key(s3_full_path: str): + """ + 输入 s3://bucket/path/to/my/file.txt + 输出 bucket, path/to/my/file.txt + """ + s3_full_path = s3_full_path.strip() + if s3_full_path.startswith("s3://"): + s3_full_path = s3_full_path[5:] + if s3_full_path.startswith("/"): + s3_full_path = s3_full_path[1:] + bucket, key = s3_full_path.split("/", 1) + return bucket, key + + +def read_file(pdf_path: str, s3_profile): + if pdf_path.startswith("s3://"): + ak, sk, end_point, addressing_style = parse_aws_param(s3_profile) + cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point, + config=Config(s3={'addressing_style': addressing_style}, retries={'max_attempts': 10, 'mode': 'standard'})) + bucket_name, bucket_key = parse_bucket_key(pdf_path) + res = cli.get_object(Bucket=bucket_name, Key=bucket_key) + file_content = res["Body"].read() + return file_content + else: + with open(pdf_path, "rb") as f: + return f.read() + + +def get_docx_model_output(pdf_model_output, page_id): + + model_output_json = pdf_model_output[page_id] + + return model_output_json + + +def list_dir(dir_path:str, s3_profile:str): + """ + 列出dir_path下的所有文件 + """ + ret = [] + + if 
dir_path.startswith("s3"): + ak, sk, end_point, addressing_style = parse_aws_param(s3_profile) + s3info = re.findall(r"s3:\/\/([^\/]+)\/(.*)", dir_path) + bucket, path = s3info[0][0], s3info[0][1] + try: + cli = boto3.client(service_name="s3", aws_access_key_id=ak, aws_secret_access_key=sk, endpoint_url=end_point, + config=Config(s3={'addressing_style': addressing_style})) + def list_obj_scluster(): + marker = None + while True: + list_kwargs = dict(MaxKeys=1000, Bucket=bucket, Prefix=path) + if marker: + list_kwargs['Marker'] = marker + response = cli.list_objects(**list_kwargs) + contents = response.get("Contents", []) + yield from contents + if not response.get("IsTruncated") or len(contents)==0: + break + marker = contents[-1]['Key'] + + + for info in list_obj_scluster(): + file_path = info['Key'] + #size = info['Size'] + + if path!="": + afile = file_path[len(path):] + if afile.endswith(".json"): + ret.append(f"s3://{bucket}/{file_path}") + + return ret + + except Exception as e: + logger.exception(e) + exit(-1) + else: #本地的目录,那么扫描本地目录并返会这个目录里的所有jsonl文件 + + for root, dirs, files in os.walk(dir_path): + for file in files: + if file.endswith(".json"): + ret.append(join_path(root, file)) + ret.sort() + return ret + +def get_img_s3_client(save_path:str, image_s3_config:str): + """ + """ + if save_path.startswith("s3://"): # 放这里是为了最少创建一个s3 client + ak, sk, end_point, addressing_style = parse_aws_param(image_s3_config) + img_s3_client = boto3.client( + service_name="s3", + aws_access_key_id=ak, + aws_secret_access_key=sk, + endpoint_url=end_point, + config=Config(s3={"addressing_style": addressing_style}, retries={'max_attempts': 5, 'mode': 'standard'}), + ) + else: + img_s3_client = None + + return img_s3_client + +if __name__=="__main__": + s3_path = "s3://llm-pdf-text/layout_det/scihub/scimag07865000-07865999/10.1007/s10729-011-9175-6.pdf/" + s3_profile = "langchao" + ret = list_dir(s3_path, s3_profile) + print(ret) + \ No newline at end of file diff --git 
"""
根据bucket的名字返回对应的s3 AK, SK,endpoint三元组

"""

import json
import os

from loguru import logger

from magic_pdf.libs.commons import parse_bucket_key


def read_config():
    """Load ~/magic-pdf.json; raise when the file does not exist."""
    config_file = os.path.join(os.path.expanduser("~"), "magic-pdf.json")

    if not os.path.exists(config_file):
        raise Exception(f"{config_file} not found")

    with open(config_file, "r") as f:
        return json.load(f)


def get_s3_config(bucket_name: str):
    """
    Read the (ak, sk, endpoint) triple for `bucket_name` from ~/magic-pdf.json,
    falling back to the "[default]" bucket entry.
    """
    config = read_config()
    bucket_info = config.get("bucket_info")

    if bucket_name not in bucket_info:
        access_key, secret_key, storage_endpoint = bucket_info["[default]"]
    else:
        access_key, secret_key, storage_endpoint = bucket_info[bucket_name]

    if access_key is None or secret_key is None or storage_endpoint is None:
        raise Exception("ak, sk or endpoint not found in magic-pdf.json")

    return access_key, secret_key, storage_endpoint


def get_s3_config_dict(path: str):
    """Same as get_s3_config but keyed by an s3 path and returned as a dict."""
    ak, sk, endpoint = get_s3_config(get_bucket_name(path))
    return {"ak": ak, "sk": sk, "endpoint": endpoint}


def get_bucket_name(path):
    """Extract the bucket name from an s3 path."""
    bucket, _ = parse_bucket_key(path)
    return bucket


def get_local_dir():
    """Temporary output directory from config, defaulting to /tmp."""
    return read_config().get("temp-output-dir", "/tmp")


def get_local_models_dir():
    """Model directory from config, defaulting to /tmp/models."""
    return read_config().get("models-dir", "/tmp/models")


def get_device():
    """Device mode from config ("cpu", "cuda", ...), defaulting to cpu."""
    return read_config().get("device-mode", "cpu")


if __name__ == "__main__":
    ak, sk, endpoint = get_s3_config("llm-raw")
def dict_to_list(input_dict):
    """Return the values of `input_dict` as a list (insertion order).

    Equivalent to the original manual loop, using the idiomatic
    dict.values() view instead.
    """
    return list(input_dict.values())


def get_scale_ratio(model_page_info, page):
    """Compute the horizontal/vertical scale ratios between the model's page
    size and the PyMuPDF page rendered at 72 dpi.

    :param model_page_info: dict exposing ['page_info']['width'/'height']
    :param page: a fitz page exposing get_pixmap(dpi=...)
    :return: (horizontal_scale_ratio, vertical_scale_ratio)
    """
    pix = page.get_pixmap(dpi=72)
    pymu_width = int(pix.w)
    pymu_height = int(pix.h)
    width_from_json = model_page_info['page_info']['width']
    height_from_json = model_page_info['page_info']['height']
    horizontal_scale_ratio = width_from_json / pymu_width
    vertical_scale_ratio = height_from_json / pymu_height
    return horizontal_scale_ratio, vertical_scale_ratio
from magic_pdf.libs.Constants import CROSS_PAGE
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.ocr_content_type import ContentType, BlockType


def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
    """Draw every bbox of page `i` on `page` in the given RGB color.

    :param i: page index into bbox_list
    :param bbox_list: per-page list of (x0, y0, x1, y1) boxes
    :param page: fitz page to draw on
    :param rgb_config: color as a [0-255, 0-255, 0-255] list
    :param fill_config: True -> translucent filled rect, False -> outline only
    """
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255  # fitz expects color components in [0, 1]
        new_rgb.append(item)
    page_data = bbox_list[i]
    for bbox in page_data:
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle


def draw_bbox_with_number(i, bbox_list, page, rgb_config, fill_config):
    """Same as draw_bbox_without_number, but also writes the 1-based index of
    each box near its top-left corner."""
    new_rgb = []
    for item in rgb_config:
        item = float(item) / 255
        new_rgb.append(item)
    page_data = bbox_list[i]
    for j, bbox in enumerate(page_data):
        x0, y0, x1, y1 = bbox
        rect_coords = fitz.Rect(x0, y0, x1, y1)  # Define the rectangle
        if fill_config:
            page.draw_rect(
                rect_coords,
                color=None,
                fill=new_rgb,
                fill_opacity=0.3,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        else:
            page.draw_rect(
                rect_coords,
                color=new_rgb,
                fill=None,
                fill_opacity=1,
                width=0.5,
                overlay=True,
            )  # Draw the rectangle
        page.insert_text(
            (x0, y0 + 10), str(j + 1), fontsize=10, color=new_rgb
        )  # Insert the index in the top left corner of the rectangle


def draw_layout_bbox(pdf_info, pdf_bytes, out_path):
    """Render a debug PDF (<out_path>/layout.pdf) with one color-coded layer
    per block category (layouts, dropped blocks, tables, images, titles,
    texts, interline equations) collected from `pdf_info`."""
    layout_bbox_list = []
    dropped_bbox_list = []
    tables_list, tables_body_list, tables_caption_list, tables_footnote_list = [], [], [], []
    imgs_list, imgs_body_list, imgs_caption_list = [], [], []
    titles_list = []
    texts_list = []
    interequations_list = []
    # Collect per-page bbox lists for each category.
    for page in pdf_info:
        page_layout_list = []
        page_dropped_list = []
        tables, tables_body, tables_caption, tables_footnote = [], [], [], []
        imgs, imgs_body, imgs_caption = [], [], []
        titles = []
        texts = []
        interequations = []
        for layout in page["layout_bboxes"]:
            page_layout_list.append(layout["layout_bbox"])
        layout_bbox_list.append(page_layout_list)
        for dropped_bbox in page["discarded_blocks"]:
            page_dropped_list.append(dropped_bbox["bbox"])
        dropped_bbox_list.append(page_dropped_list)
        for block in page["para_blocks"]:
            bbox = block["bbox"]
            if block["type"] == BlockType.Table:
                tables.append(bbox)
                # Table blocks nest body/caption/footnote sub-blocks.
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.TableBody:
                        tables_body.append(bbox)
                    elif nested_block["type"] == BlockType.TableCaption:
                        tables_caption.append(bbox)
                    elif nested_block["type"] == BlockType.TableFootnote:
                        tables_footnote.append(bbox)
            elif block["type"] == BlockType.Image:
                imgs.append(bbox)
                # Image blocks nest body/caption sub-blocks.
                for nested_block in block["blocks"]:
                    bbox = nested_block["bbox"]
                    if nested_block["type"] == BlockType.ImageBody:
                        imgs_body.append(bbox)
                    elif nested_block["type"] == BlockType.ImageCaption:
                        imgs_caption.append(bbox)
            elif block["type"] == BlockType.Title:
                titles.append(bbox)
            elif block["type"] == BlockType.Text:
                texts.append(bbox)
            elif block["type"] == BlockType.InterlineEquation:
                interequations.append(bbox)
        tables_list.append(tables)
        tables_body_list.append(tables_body)
        tables_caption_list.append(tables_caption)
        tables_footnote_list.append(tables_footnote)
        imgs_list.append(imgs)
        imgs_body_list.append(imgs_body)
        imgs_caption_list.append(imgs_caption)
        titles_list.append(titles)
        texts_list.append(texts)
        interequations_list.append(interequations)

    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, dropped_bbox_list, page, [158, 158, 158], True)
        draw_bbox_without_number(i, tables_list, page, [153, 153, 0], True)  # color !
        draw_bbox_without_number(i, tables_body_list, page, [204, 204, 0], True)
        draw_bbox_without_number(i, tables_caption_list, page, [255, 255, 102], True)
        draw_bbox_without_number(i, tables_footnote_list, page, [229, 255, 204], True)
        draw_bbox_without_number(i, imgs_list, page, [51, 102, 0], True)
        draw_bbox_without_number(i, imgs_body_list, page, [153, 255, 51], True)
        draw_bbox_without_number(i, imgs_caption_list, page, [102, 178, 255], True)
        draw_bbox_without_number(i, titles_list, page, [102, 102, 255], True)
        draw_bbox_without_number(i, texts_list, page, [153, 0, 76], True)
        draw_bbox_without_number(i, interequations_list, page, [0, 255, 0], True)

    # Save the PDF
    pdf_docs.save(f"{out_path}/layout.pdf")


def draw_span_bbox(pdf_info, pdf_bytes, out_path):
    """Render a debug PDF (<out_path>/spans.pdf) with one color-coded layer
    per span category; spans flagged as cross-page are drawn on the page
    that follows the one they were found on."""
    text_list = []
    inline_equation_list = []
    interline_equation_list = []
    image_list = []
    table_list = []
    dropped_list = []
    next_page_text_list = []
    next_page_inline_equation_list = []

    def get_span_info(span):
        # NOTE: reads the page_* lists assigned in the per-page loop below;
        # Python late binding makes this work even though they are defined later.
        if span["type"] == ContentType.Text:
            if span.get(CROSS_PAGE, False):
                next_page_text_list.append(span["bbox"])
            else:
                page_text_list.append(span["bbox"])
        elif span["type"] == ContentType.InlineEquation:
            if span.get(CROSS_PAGE, False):
                next_page_inline_equation_list.append(span["bbox"])
            else:
                page_inline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.InterlineEquation:
            page_interline_equation_list.append(span["bbox"])
        elif span["type"] == ContentType.Image:
            page_image_list.append(span["bbox"])
        elif span["type"] == ContentType.Table:
            page_table_list.append(span["bbox"])

    for page in pdf_info:
        page_text_list = []
        page_inline_equation_list = []
        page_interline_equation_list = []
        page_image_list = []
        page_table_list = []
        page_dropped_list = []

        # Move cross-page spans into the lists of the following page.
        if len(next_page_text_list) > 0:
            page_text_list.extend(next_page_text_list)
            next_page_text_list.clear()
        if len(next_page_inline_equation_list) > 0:
            page_inline_equation_list.extend(next_page_inline_equation_list)
            next_page_inline_equation_list.clear()

        # Build dropped_list.
        for block in page["discarded_blocks"]:
            if block["type"] == BlockType.Discarded:
                for line in block["lines"]:
                    for span in line["spans"]:
                        page_dropped_list.append(span["bbox"])
        dropped_list.append(page_dropped_list)
        # Build the remaining "useful" lists.
        for block in page["para_blocks"]:
            if block["type"] in [
                BlockType.Text,
                BlockType.Title,
                BlockType.InterlineEquation,
            ]:
                for line in block["lines"]:
                    for span in line["spans"]:
                        get_span_info(span)
            elif block["type"] in [BlockType.Image, BlockType.Table]:
                for sub_block in block["blocks"]:
                    for line in sub_block["lines"]:
                        for span in line["spans"]:
                            get_span_info(span)
        text_list.append(page_text_list)
        inline_equation_list.append(page_inline_equation_list)
        interline_equation_list.append(page_interline_equation_list)
        image_list.append(page_image_list)
        table_list.append(page_table_list)
    pdf_docs = fitz.open("pdf", pdf_bytes)
    for i, page in enumerate(pdf_docs):
        # Draw the collected data for the current page.
        draw_bbox_without_number(i, text_list, page, [255, 0, 0], False)
        draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0], False)
        draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255], False)
        draw_bbox_without_number(i, image_list, page, [255, 204, 0], False)
        draw_bbox_without_number(i, table_list, page, [204, 0, 255], False)
        draw_bbox_without_number(i, dropped_list, page, [158, 158, 158], False)

    # Save the PDF
    pdf_docs.save(f"{out_path}/spans.pdf")
class DropReason:
    """Machine-readable reasons for dropping a page or document during parsing.

    NOTE(review): several names carry historical misspellings
    (``TEXT_BLCOK...``, ``...lOAD...``). They are part of the public
    interface and are kept unchanged; only values matter downstream.
    """

    # Horizontally overlapping text blocks make reading order ambiguous.
    TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap"
    # Blocks that must be kept overlap horizontally.
    USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap"
    COMPLICATED_LAYOUT = "complicated_layout"  # layout too complex, unsupported for now
    TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns"  # more than 2 columns unsupported
    # Colored background boxes change the reading order; such PDFs are unsupported.
    COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box"
    HIGH_COMPUTATIONAL_lOAD_BY_IMGS = "high_computational_load_by_imgs"  # special images too costly
    HIGH_COMPUTATIONAL_lOAD_BY_SVGS = "high_computational_load_by_svgs"  # special SVGs too costly
    HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = "high_computational_load_by_total_pages"  # overall cost over budget
    MISS_DOC_LAYOUT_RESULT = "missing doc_layout_result"  # layout analysis failed
    Exception = "_exception"  # an exception occurred while parsing
    ENCRYPTED = "encrypted"  # the PDF is encrypted
    EMPTY_PDF = "total_page=0"  # the PDF has zero pages
    NOT_IS_TEXT_PDF = "not_is_text_pdf"  # not a text-based PDF, cannot parse directly
    DENSE_SINGLE_LINE_BLOCK = "dense_single_line_block"  # paragraphs cannot be split cleanly
    TITLE_DETECTION_FAILED = "title_detection_failed"  # failed to detect titles
    TITLE_LEVEL_FAILED = "title_level_failed"  # failed to rank title levels (h1/h2/h3)
    PARA_SPLIT_FAILED = "para_split_failed"  # paragraph segmentation failed
    PARA_MERGE_FAILED = "para_merge_failed"  # paragraph merging failed
    NOT_ALLOW_LANGUAGE = "not_allow_language"  # unsupported language
    SPECIAL_PDF = "special_pdf"
    PSEUDO_SINGLE_COLUMN = "pseudo_single_column"  # column layout cannot be decided reliably
    CAN_NOT_DETECT_PAGE_LAYOUT = "can_not_detect_page_layout"  # page layout analysis failed
    NEGATIVE_BBOX_AREA = "negative_bbox_area"  # scaling produced a negative bbox area
    OVERLAP_BLOCKS_CAN_NOT_SEPARATION = "overlap_blocks_can_t_separation"  # overlapping blocks cannot be separated
def compute_md5(file_bytes):
    """Return the uppercase hex MD5 digest of *file_bytes*."""
    return hashlib.md5(file_bytes).hexdigest().upper()


def compute_sha256(input_string):
    """Return the lowercase hex SHA-256 digest of *input_string*.

    The string is UTF-8 encoded first, since hash functions operate on bytes.
    """
    return hashlib.sha256(input_string.encode("utf-8")).hexdigest()
def detect_lang(text: str) -> str:
    """Best-effort language detection.

    Returns a lowercase language code (e.g. ``"en"``, ``"zh"``) or ``""``
    when detection fails or *text* is empty.

    Falls back to stripping Unicode control characters (category ``C``)
    when fast_langdetect rejects the raw input.
    """
    if len(text) == 0:
        return ""
    try:
        lang_upper = detect_language(text)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt /
        # SystemExit; narrowed to Exception. Retry with control chars removed.
        sanitized = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
        lang_upper = detect_language(sanitized)
    try:
        lang = lang_upper.lower()
    except Exception:
        # Detector may return a non-string (e.g. None); treat as unknown.
        lang = ""
    return lang
def float_gt(a, b):
    """Strictly-greater-than with a 1e-4 absolute tolerance.

    Values within the tolerance compare as not-greater.
    """
    if abs(a - b) <= 0.0001:
        return False
    return a > b


def float_equal(a, b):
    """Equality with a 1e-4 absolute tolerance."""
    return abs(a - b) <= 0.0001
"version": "3.7.1", + }, + "zh_core_web_sm": { + "type": "spacy", + "version": "3.7.0", + }, + "zh_core_web_md": { + "type": "spacy", + "version": "3.7.0", + }, + "zh_core_web_lg": { + "type": "spacy", + "version": "3.7.0", + }, + } + self.en_core_web_sm_model = en_core_web_sm.load() + self.zh_core_web_sm_model = zh_core_web_sm.load() + + def load_model(self, model_name, model_type, model_version): + if ( + model_name in self.nlp_models + and self.nlp_models[model_name]["type"] == model_type + and self.nlp_models[model_name]["version"] == model_version + ): + return spacy.load(model_name) if spacy.util.is_package(model_name) else None + + else: + logger.error(f"Unsupported model name or version: {model_name} {model_version}") + return None + + def detect_language(self, text, use_langdetect=False): + if len(text) == 0: + return None + if use_langdetect: + # print("use_langdetect") + # print(detect_lang(text)) + # return detect_lang(text) + if detect_lang(text) == "zh": + return "zh" + else: + return "en" + + if not use_langdetect: + en_count = len(re.findall(r"[a-zA-Z]", text)) + cn_count = len(re.findall(r"[\u4e00-\u9fff]", text)) + + if en_count > cn_count: + return "en" + + if cn_count > en_count: + return "zh" + + def detect_entity_catgr_using_nlp(self, text, threshold=0.5): + """ + Detect entity categories using NLP models and return the most frequent entity types. + + Parameters + ---------- + text : str + Text to be processed. + + Returns + ------- + str + The most frequent entity type. 
+ """ + lang = self.detect_language(text, use_langdetect=True) + + if lang == "en": + nlp_model = self.en_core_web_sm_model + elif lang == "zh": + nlp_model = self.zh_core_web_sm_model + else: + # logger.error(f"Unsupported language: {lang}") + return {} + + # Splitting text into smaller parts + text_parts = re.split(r"[,;,;、\s & |]+", text) + + text_parts = [part for part in text_parts if not re.match(r"[\d\W]+", part)] # Remove non-words + text_combined = " ".join(text_parts) + + try: + doc = nlp_model(text_combined) + entity_counts = Counter([ent.label_ for ent in doc.ents]) + word_counts_in_entities = Counter() + + for ent in doc.ents: + word_counts_in_entities[ent.label_] += len(ent.text.split()) + + total_words_in_entities = sum(word_counts_in_entities.values()) + total_words = len([token for token in doc if not token.is_punct]) + + if total_words_in_entities == 0 or total_words == 0: + return None + + entity_percentage = total_words_in_entities / total_words + if entity_percentage < 0.5: + return None + + most_common_entity, word_count = word_counts_in_entities.most_common(1)[0] + entity_percentage = word_count / total_words_in_entities + + if entity_percentage >= threshold: + return most_common_entity + else: + return None + except Exception as e: + logger.error(f"Error in entity detection: {e}") + return None + + +def __main__(): + nlpModel = NLPModels() + + test_strings = [ + "张三", + "张三, 李四,王五; 赵六", + "John Doe", + "Jane Smith", + "Lee, John", + "John Doe, Jane Smith; Alice Johnson,Bob Lee", + "孙七, Michael Jordan;赵八", + "David Smith Michael O'Connor; Kevin ßáçøñ", + "李雷·韩梅梅, 张三·李四", + "Charles Robert Darwin, Isaac Newton", + "莱昂纳多·迪卡普里奥, 杰克·吉伦哈尔", + "John Doe, Jane Smith; Alice Johnson", + "张三, 李四,王五; 赵六", + "Lei Wang, Jia Li, and Xiaojun Chen, LINKE YANG OU, and YUAN ZHANG", + "Rachel Mills & William Barry & Susanne B. 
class ContentType:
    """String tags for span-level content categories."""

    Image = "image"
    Table = "table"
    Text = "text"
    InlineEquation = "inline_equation"
    InterlineEquation = "interline_equation"


class BlockType:
    """String tags for block-level layout categories."""

    Image = "image"
    ImageBody = "image_body"
    ImageCaption = "image_caption"
    Table = "table"
    TableBody = "table_body"
    TableCaption = "table_caption"
    TableFootnote = "table_footnote"
    Text = "text"
    Title = "title"
    InterlineEquation = "interline_equation"
    Footnote = "footnote"
    Discarded = "discarded"
def parse_s3_range_params(s3path: str):
    """Extract the byte-range suffix of an S3 path.

    example: s3://abc/xxxx.json?bytes=0,81350 ==> ["0", "81350"]

    Returns ``None`` when no ``?bytes=`` suffix is present.
    """
    _head, *tail = s3path.split("?bytes=")
    if not tail:
        return None
    return tail[0].split(",")
def detect_invalid_chars(src_pdf_bytes: bytes) -> bool:
    """Check whether the PDF's text extracts cleanly (no mojibake).

    Returns ``True`` for a normal document and ``False`` when more than 5%
    of the sampled text consists of garbled characters.
    """
    # pdfminer is slow, so inspect only a random sample of ~10 pages.
    sample_docs = extract_pages(src_pdf_bytes)
    sample_pdf_bytes = sample_docs.tobytes()
    sample_pdf_file_like_object = BytesIO(sample_pdf_bytes)
    text = extract_text(sample_pdf_file_like_object)
    text = text.replace("\n", "")
    # Garbled text comes out of pdfminer as "(cid:NNN)" tokens.
    cid_pattern = re.compile(r"\(cid:\d+\)")
    matches = cid_pattern.findall(text)
    cid_count = len(matches)
    cid_len = sum(len(match) for match in matches)
    text_len = len(text)
    if text_len == 0:
        cid_chars_ratio = 0
    else:
        # Ratio of cid tokens relative to (real characters + cid tokens).
        cid_chars_ratio = cid_count / (cid_count + text_len - cid_len)
    logger.info(
        f"cid_count: {cid_count}, text_len: {text_len}, cid_chars_ratio: {cid_chars_ratio}"
    )
    # A document whose sampled text is >5% mojibake is treated as garbled.
    return cid_chars_ratio <= 0.05
def __inc_dict_val(mp, key, val_inc: int):
    """Add *val_inc* to ``mp[key]``, treating a missing key as 0."""
    # dict.get with a default replaces the old truthiness-based if/else,
    # which also behaved incorrectly in spirit (though not in effect) for
    # an existing value of 0.
    mp[key] = mp.get(key, 0) + val_inc


def get_text_block_base_info(block):
    """Return the dominant ``(color, size, font)`` of a text block.

    Dominance is decided by the number of characters rendered with each
    (color, size rounded to 2 decimals, font) combination. Raises
    ``ValueError`` for a block with no spans, as before.
    """
    counter = {}
    for line in block["lines"]:
        for span in line["spans"]:
            key = (span["color"], round(span["size"], 2), span["font"])
            __inc_dict_val(counter, key, len(span["text"]))

    # max over dict keys with the counts as the sort key; ties break by
    # insertion order, matching the original behavior.
    return max(counter, key=counter.get)
def draw_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, save_path: str):
    """Draw block/image/table bboxes of each page onto a debug PDF.

    One new page (matching the source page size) is appended per entry of
    *paras_dict*. If *save_path* already exists the pages are appended
    incrementally, otherwise a fresh PDF is created.
    """
    # Append to an existing debug PDF when present, else start blank.
    is_new_pdf = not os.path.exists(save_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(save_path)

    color_map = {
        'image': fitz.pdfcolor["yellow"],
        'text': fitz.pdfcolor['blue'],
        "table": fitz.pdfcolor['green'],
    }

    def paint(page, bbox, **style):
        # One shape per rectangle, mirroring the original
        # draw_rect / finish / finish / commit cycle.
        shape = page.new_shape()
        shape.draw_rect(fitz.Rect(bbox))
        shape.finish(**style)
        shape.finish()
        shape.commit()

    for _, page_info in paras_dict.items():
        page_idx = page_info['page_idx']
        src_rect = raw_pdf_doc[page_idx].rect
        new_page = doc.new_page(width=src_rect.width, height=src_rect.height)

        # Pre-processed text blocks: translucent blue fill.
        for block in page_info['preproc_blocks']:
            paint(new_page, block['bbox'], color=None, fill=color_map['text'], fill_opacity=0.2)

        # Kept images: solid yellow fill.
        for img in page_info['images']:
            paint(new_page, img['bbox'], color=None, fill=color_map['image'])

        # Backed-up (removed) images: yellow outline only.
        for img in page_info['image_backup']:
            paint(new_page, img['bbox'], color=color_map['image'], fill=None)

        # Dropped text blocks: translucent black fill.
        for tb in page_info['droped_text_block']:
            paint(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.4)

        # TODO table
        for tb in page_info['tables']:
            paint(new_page, tb['bbox'], color=None, fill=fitz.pdfcolor['green'], fill_opacity=0.2)

    parent_dir = os.path.dirname(save_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if is_new_pdf:
        doc.save(save_path)
    else:
        doc.saveIncr()
    doc.close()
def draw_layout_bbox_on_page(raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str):
    """Overlay layout boxes (numbered by reading order) onto *pdf_path*.

    Also shades the *header* / *footer* bands when provided. If the target
    PDF already exists it is updated incrementally, otherwise a blank
    document is created.
    """
    # Reuse the existing debug PDF when present, otherwise start blank.
    is_new_pdf = not os.path.exists(pdf_path)
    doc = fitz.open('') if is_new_pdf else fitz.open(pdf_path)

    font_size = 10
    border_offset = 1

    for page_info in paras_dict.values():
        page = doc[page_info['page_idx']]
        shape = page.new_shape()

        for order, layout in enumerate(page_info['layout_bboxes']):
            x0, y0, x1, y1 = layout['layout_bbox']
            # Unrecognized ('U') layouts get a pink fill so they stand out.
            fill_color = fitz.pdfcolor['pink'] if layout['layout_label'] == 'U' else None
            # Shrink horizontally / grow vertically by one point so
            # neighboring boxes stay visually distinct.
            box = [x0 + 1, y0 - border_offset, x1 - 1, y1 + border_offset]
            shape.draw_rect(fitz.Rect(*box))
            shape.finish(color=fitz.pdfcolor['red'], fill=fill_color, fill_opacity=0.4)
            # Stamp the reading order just inside the top-left corner.
            shape.insert_text(
                (box[0] + 1, box[1] + font_size), f"{order}", fontsize=font_size, color=(0, 0, 0)
            )

        # Shade header / footer bands.
        if header:
            shape.draw_rect(fitz.Rect(header))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)
        if footer:
            shape.draw_rect(fitz.Rect(footer))
            shape.finish(color=None, fill=fitz.pdfcolor['black'], fill_opacity=0.2)

        shape.commit()

    if is_new_pdf:
        doc.save(pdf_path)
    else:
        doc.saveIncr()
    doc.close()
def dict_compare(d1, d2):
    """True when both dicts hold exactly the same key/value pairs."""
    return d1.items() == d2.items()


def remove_duplicates_dicts(lst):
    """Return *lst* with duplicate dicts removed, keeping first occurrences.

    Dicts are unhashable, so this is an O(n^2) pairwise scan — fine for the
    small per-page model lists it is used on.
    """
    unique_dicts = []
    for candidate in lst:
        if all(not dict_compare(candidate, seen) for seen in unique_dicts):
            unique_dicts.append(candidate)
    return unique_dicts
class ModelSingleton:
    """Process-wide cache of initialized models, keyed by (ocr, show_log).

    Every construction of this class returns the same instance, so model
    initialization happens at most once per configuration.
    """

    _instance = None
    _models = {}

    def __new__(cls, *args, **kwargs):
        # Classic lazy singleton.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def get_model(self, ocr: bool, show_log: bool):
        """Return the model for this configuration, initializing it on first use."""
        key = (ocr, show_log)
        try:
            return self._models[key]
        except KeyError:
            model = custom_model_init(ocr=ocr, show_log=show_log)
            self._models[key] = model
            return model
enumerate(images): + img = img_dict["img"] + page_width = img_dict["width"] + page_height = img_dict["height"] + result = custom_model(img) + page_info = {"page_no": index, "height": page_height, "width": page_width} + page_dict = {"layout_dets": result, "page_info": page_info} + model_json.append(page_dict) + doc_analyze_cost = time.time() - doc_analyze_start + logger.info(f"doc analyze cost: {doc_analyze_cost}") + + return model_json diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py new file mode 100644 index 0000000000000000000000000000000000000000..9c95f21e6ddb37c8698458f5856714c4eabb813e --- /dev/null +++ b/magic_pdf/model/magic_model.py @@ -0,0 +1,636 @@ +import json +import math + +from magic_pdf.libs.commons import fitz +from loguru import logger + +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.coordinate_transform import get_scale_ratio +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +from magic_pdf.libs.math import float_gt +from magic_pdf.libs.boxbase import ( + _is_in, + bbox_relative_pos, + bbox_distance, + _is_part_overlap, + calculate_overlap_area_in_bbox1_area_ratio, + calculate_iou, +) +from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum + +CAPATION_OVERLAP_AREA_RATIO = 0.6 + + +class MagicModel: + """ + 每个函数没有得到元素的时候返回空list + + """ + + def __fix_axis(self): + for model_page_info in self.__model_list: + need_remove_list = [] + page_no = model_page_info["page_info"]["page_no"] + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( + model_page_info, self.__docs[page_no] + ) + layout_dets = model_page_info["layout_dets"] + for layout_det in layout_dets: + + if layout_det.get("bbox") is not None: + # 兼容直接输出bbox的模型数据,如paddle + x0, y0, x1, y1 = layout_det["bbox"] + else: + # 兼容直接输出poly的模型数据,如xxx + x0, y0, _, _, x1, y1, _, _ = layout_det["poly"] + + bbox = [ + 
    def __fix_by_remove_low_confidence(self):
        """Drop detections whose confidence score is <= 0.05.

        Mutates each page's ``layout_dets`` list in place; removals are
        collected first so the list is not modified while being iterated.
        """
        for model_page_info in self.__model_list:
            need_remove_list = []
            layout_dets = model_page_info["layout_dets"]
            for layout_det in layout_dets:
                if layout_det["score"] <= 0.05:
                    need_remove_list.append(layout_det)
                else:
                    continue
            for need_remove in need_remove_list:
                layout_dets.remove(need_remove)

    def __fix_by_remove_high_iou_and_low_confidence(self):
        """Of two heavily overlapping detections (IoU > 0.9), drop the lower score.

        Only genuine layout categories (0-9) compete with each other; spans
        such as OCR text (15) are left untouched. O(n^2) pairwise pass per
        page; each pair is visited twice ((i,j) and (j,i)), the membership
        check on ``need_remove_list`` keeps the removal list free of
        duplicates.
        """
        for model_page_info in self.__model_list:
            need_remove_list = []
            layout_dets = model_page_info["layout_dets"]
            for layout_det1 in layout_dets:
                for layout_det2 in layout_dets:
                    if layout_det1 == layout_det2:
                        continue
                    if layout_det1["category_id"] in [
                        0,
                        1,
                        2,
                        3,
                        4,
                        5,
                        6,
                        7,
                        8,
                        9,
                    ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                        if (
                            calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
                            > 0.9
                        ):
                            # On a score tie layout_det2 is removed.
                            if layout_det1["score"] < layout_det2["score"]:
                                layout_det_need_remove = layout_det1
                            else:
                                layout_det_need_remove = layout_det2

                            if layout_det_need_remove not in need_remove_list:
                                need_remove_list.append(layout_det_need_remove)
                        else:
                            continue
                    else:
                        continue
            for need_remove in need_remove_list:
                layout_dets.remove(need_remove)

    def __init__(self, model_list: list, docs: fitz.Document):
        """Normalize raw per-page model output against the PDF document.

        :param model_list: per-page model dicts ("layout_dets" + "page_info")
        :param docs: the fitz document the model output was produced from
        """
        self.__model_list = model_list
        self.__docs = docs
        # Add a "bbox" to every detection (scales poly -> bbox into PDF coords).
        """为所有模型数据添加bbox信息(缩放,poly->bbox)"""
        self.__fix_axis()
        # Remove very low-confidence detections (<= 0.05) to improve quality.
        """删除置信度特别低的模型数据(<0.05),提高质量"""
        self.__fix_by_remove_low_confidence()
        # Of two detections with IoU > 0.9, keep only the higher-scored one.
        """删除高iou(>0.9)数据中置信度较低的那个"""
        self.__fix_by_remove_high_iou_and_low_confidence()
    def __tie_up_category_by_distance(
        self, page_no, subject_category_id, object_category_id
    ):
        """Pair each subject box with its nearest object box(es) on a page.

        Assumption: each subject has at most one object (several adjacent
        objects may be merged into a single one), and each object belongs to
        at most one subject. Used e.g. for figure->caption (3,4) and
        table->caption/footnote (5,6 / 5,7) pairing.

        Returns (ret, total_subject_object_dis) where ``ret`` is a list of
        dicts with "subject_body", "all", "score" and optionally
        "object_body", and the second value is a pairing-cost heuristic.
        """
        ret = []
        MAX_DIS_OF_POINT = 10**9 + 7

        # The subject and object bboxes are merged into one big bbox
        # ("merged bbox"). Select all subjects that overlap the merged bbox
        # with an overlap area larger than the object's own area, then take
        # the shortest distance between those subjects and the object.
        def may_find_other_nearest_bbox(subject_idx, object_idx):
            ret = float("inf")

            x0 = min(
                all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0]
            )
            y0 = min(
                all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1]
            )
            x1 = max(
                all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2]
            )
            y1 = max(
                all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3]
            )

            object_area = abs(
                all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]
            ) * abs(
                all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1]
            )

            for i in range(len(all_bboxes)):
                if (
                    i == subject_idx
                    or all_bboxes[i]["category_id"] != subject_category_id
                ):
                    continue
                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(
                    all_bboxes[i]["bbox"], [x0, y0, x1, y1]
                ):

                    i_area = abs(
                        all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
                    ) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
                    if i_area >= object_area:
                        # NOTE(review): `min(float("inf"), ...)` never folds in
                        # the previous `ret`, so this keeps the LAST qualifying
                        # distance rather than the minimum; looks like it was
                        # meant to be `min(ret, dis[i][object_idx])` — confirm.
                        ret = min(float("inf"), dis[i][object_idx])

            return ret

        def expand_bbbox(idxes):
            # Union bbox of all boxes referenced by `idxes`.
            x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
            y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
            x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
            y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
            return min(x0s), min(y0s), max(x1s), max(y1s)

        # Collect subject/object candidates, discarding boxes fully contained
        # in another box of the same category.
        subjects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
                    filter(
                        lambda x: x["category_id"] == subject_category_id,
                        self.__model_list[page_no]["layout_dets"],
                    ),
                )
            )
        )

        objects = self.__reduct_overlap(
            list(
                map(
                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
                    filter(
                        lambda x: x["category_id"] == object_category_id,
                        self.__model_list[page_no]["layout_dets"],
                    ),
                )
            )
        )
        subject_object_relation_map = {}

        subjects.sort(
            key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2
        )  # get the distance !

        all_bboxes = []

        for v in subjects:
            all_bboxes.append(
                {
                    "category_id": subject_category_id,
                    "bbox": v["bbox"],
                    "score": v["score"],
                }
            )

        for v in objects:
            all_bboxes.append(
                {
                    "category_id": object_category_id,
                    "bbox": v["bbox"],
                    "score": v["score"],
                }
            )

        N = len(all_bboxes)
        dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]

        # Symmetric distance matrix; subject-subject pairs stay "infinite".
        for i in range(N):
            for j in range(i):
                if (
                    all_bboxes[i]["category_id"] == subject_category_id
                    and all_bboxes[j]["category_id"] == subject_category_id
                ):
                    continue

                dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
                dis[j][i] = dis[i][j]

        used = set()
        for i in range(N):
            # Find the object(s) associated with the i-th subject.
            if all_bboxes[i]["category_id"] != subject_category_id:
                continue
            seen = set()
            candidates = []
            arr = []
            for j in range(N):

                # Number of "strictly to one side" flags; more than one means
                # the boxes are diagonal to each other and are not paired.
                pos_flag_count = sum(
                    list(
                        map(
                            lambda x: 1 if x else 0,
                            bbox_relative_pos(
                                all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                            ),
                        )
                    )
                )
                if pos_flag_count > 1:
                    continue
                if (
                    all_bboxes[j]["category_id"] != object_category_id
                    or j in used
                    or dis[i][j] == MAX_DIS_OF_POINT
                ):
                    continue
                left, right, _, _ = bbox_relative_pos(
                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                )  # the pos_flag_count logic above guarantees this is valid
                # Reject objects farther away than one subject width/height.
                if left or right:
                    one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
                else:
                    one_way_dis = all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1]
                if dis[i][j] > one_way_dis:
                    continue
                arr.append((dis[i][j], j))

            arr.sort(key=lambda x: x[0])
            if len(arr) > 0:
                # bug: the object nearest to this subject may lie across
                # another subject, e.g. [this subject] [some subject]
                # [the nearest object of subject].
                if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:

                    candidates.append(arr[0][1])
                    seen.add(arr[0][1])

            # The initial seed object has been selected; now grow the cluster
            # by repeatedly absorbing objects nearest to the current ones.
            for j in set(candidates):
                tmp = []
                for k in range(i + 1, N):
                    pos_flag_count = sum(
                        list(
                            map(
                                lambda x: 1 if x else 0,
                                bbox_relative_pos(
                                    all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
                                ),
                            )
                        )
                    )

                    if pos_flag_count > 1:
                        continue

                    if (
                        all_bboxes[k]["category_id"] != object_category_id
                        or k in used
                        or k in seen
                        or dis[j][k] == MAX_DIS_OF_POINT
                        or dis[j][k] > dis[i][j]
                    ):
                        continue

                    is_nearest = True
                    for l in range(i + 1, N):
                        if l in (j, k) or l in used or l in seen:
                            continue

                        if not float_gt(dis[l][k], dis[j][k]):
                            is_nearest = False
                            break

                    if is_nearest:
                        nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
                        n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
                        if float_gt(dis[i][j], n_dis):
                            continue
                        tmp.append(k)
                        seen.add(k)

                candidates = tmp
                if len(candidates) == 0:
                    break

            # All captions nearest to this figure (and captions nearest to
            # those captions) have been collected. First expand the bbox,
            ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
            ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]

            # then split into 4 candidate regions (left/top/bottom/right of
            # the subject) and measure the merged object area in each.
            caption_poses = [
                [ox0, oy0, ix0, oy1],
                [ox0, oy0, ox1, iy0],
                [ox0, iy1, ox1, oy1],
                [ix1, oy0, ox1, oy1],
            ]

            caption_areas = []
            for bbox in caption_poses:
                embed_arr = []
                for idx in seen:
                    if (
                        calculate_overlap_area_in_bbox1_area_ratio(
                            all_bboxes[idx]["bbox"], bbox
                        )
                        > CAPATION_OVERLAP_AREA_RATIO
                    ):
                        embed_arr.append(idx)

                if len(embed_arr) > 0:
                    embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
                    embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
                    embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
                    embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
                    caption_areas.append(
                        int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
                    )
                else:
                    caption_areas.append(0)

            # Keep only the objects in the winning (largest-area) region.
            subject_object_relation_map[i] = []
            if max(caption_areas) > 0:
                max_area_idx = caption_areas.index(max(caption_areas))
                caption_bbox = caption_poses[max_area_idx]

                for j in seen:
                    if (
                        calculate_overlap_area_in_bbox1_area_ratio(
                            all_bboxes[j]["bbox"], caption_bbox
                        )
                        > CAPATION_OVERLAP_AREA_RATIO
                    ):
                        used.add(j)
                        subject_object_relation_map[i].append(j)

        for i in sorted(subject_object_relation_map.keys()):
            result = {
                "subject_body": all_bboxes[i]["bbox"],
                "all": all_bboxes[i]["bbox"],
                "score": all_bboxes[i]["score"],
            }

            if len(subject_object_relation_map[i]) > 0:
                x0 = min(
                    [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
                )
                y0 = min(
                    [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
                )
                x1 = max(
                    [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
                )
                y1 = max(
                    [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
                )
                result["object_body"] = [x0, y0, x1, y1]
                result["all"] = [
                    min(x0, all_bboxes[i]["bbox"][0]),
                    min(y0, all_bboxes[i]["bbox"][1]),
                    max(x1, all_bboxes[i]["bbox"][2]),
                    max(y1, all_bboxes[i]["bbox"][3]),
                ]
            ret.append(result)

        total_subject_object_dis = 0
        # Accumulate the distances of the pairs already matched.
        for i in subject_object_relation_map.keys():
            for j in subject_object_relation_map[i]:
                total_subject_object_dis += bbox_distance(
                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
                )

        # Add distances for unmatched subjects/objects (approximate version).
        # NOTE(review): the comprehension below tests
        # `subject_object_relation_map[i]` with the stale loop variable `i`
        # instead of `key` — presumably intended to be `[key]`; confirm.
        with_caption_subject = set(
            [
                key
                for key in subject_object_relation_map.keys()
                if len(subject_object_relation_map[i]) > 0
            ]
        )
        for i in range(N):
            if all_bboxes[i]["category_id"] != object_category_id or i in used:
                continue
            candidates = []
            for j in range(N):
                if (
                    all_bboxes[j]["category_id"] != subject_category_id
                    or j in with_caption_subject
                ):
                    continue
                candidates.append((dis[i][j], j))
            if len(candidates) > 0:
                candidates.sort(key=lambda x: x[0])
                # NOTE(review): `candidates[0][1]` is the subject INDEX, not
                # the distance (`candidates[0][0]`), and the `j` added below
                # is the stale post-loop value rather than the matched
                # subject's index — both look like bugs, but they only affect
                # this heuristic total, which callers currently discard.
                total_subject_object_dis += candidates[0][1]
                with_caption_subject.add(j)
        return ret, total_subject_object_dis

    def get_imgs(self, page_no: int):
        """Return figure records (category 3) paired with captions (4)."""
        figure_captions, _ = self.__tie_up_category_by_distance(
            page_no, 3, 4
        )
        return [
            {
                "bbox": record["all"],
                "img_body_bbox": record["subject_body"],
                "img_caption_bbox": record.get("object_body", None),
                "score": record["score"],
            }
            for record in figure_captions
        ]

    def get_tables(
        self, page_no: int
    ) -> list:  # three bboxes: caption, table body, table footnote
        """Return table records (category 5) paired with captions (6) and footnotes (7)."""
        with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
        with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)
        ret = []
        N, M = len(with_captions), len(with_footnotes)
        # Both pairings iterate the same subjects in the same order, so the
        # i-th entries describe the same table.
        assert N == M
        for i in range(N):
            record = {
                "score": with_captions[i]["score"],
                "table_caption_bbox": with_captions[i].get("object_body", None),
                "table_body_bbox": with_captions[i]["subject_body"],
                "table_footnote_bbox": with_footnotes[i].get("object_body", None),
            }

            x0 = min(with_captions[i]["all"][0], with_footnotes[i]["all"][0])
            y0 = min(with_captions[i]["all"][1], with_footnotes[i]["all"][1])
            x1 = max(with_captions[i]["all"][2], with_footnotes[i]["all"][2])
            y1 = max(with_captions[i]["all"][3], with_footnotes[i]["all"][3])
            record["bbox"] = [x0, y0, x1, y1]
            ret.append(record)
        return ret
ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no + ) + return inline_equations, interline_equations, interline_equations_blocks + + def get_discarded(self, page_no: int) -> list: # 自研模型,只有坐标 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.ABANDON.value, page_no) + return blocks + + def get_text_blocks(self, page_no: int) -> list: # 自研模型搞的,只有坐标,没有字 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.PLAIN_TEXT.value, page_no) + return blocks + + def get_title_blocks(self, page_no: int) -> list: # 自研模型,只有坐标,没字 + blocks = self.__get_blocks_by_type(ModelBlockTypeEnum.TITLE.value, page_no) + return blocks + + def get_ocr_text(self, page_no: int) -> list: # paddle 搞的,有字也有坐标 + text_spans = [] + model_page_info = self.__model_list[page_no] + layout_dets = model_page_info["layout_dets"] + for layout_det in layout_dets: + if layout_det["category_id"] == "15": + span = { + "bbox": layout_det["bbox"], + "content": layout_det["text"], + } + text_spans.append(span) + return text_spans + + def get_all_spans(self, page_no: int) -> list: + def remove_duplicate_spans(spans): + new_spans = [] + for span in spans: + if not any(span == existing_span for existing_span in new_spans): + new_spans.append(span) + return new_spans + + all_spans = [] + model_page_info = self.__model_list[page_no] + layout_dets = model_page_info["layout_dets"] + allow_category_id_list = [3, 5, 13, 14, 15] + """当成span拼接的""" + # 3: 'image', # 图片 + # 5: 'table', # 表格 + # 13: 'inline_equation', # 行内公式 + # 14: 'interline_equation', # 行间公式 + # 15: 'text', # ocr识别文本 + for layout_det in layout_dets: + category_id = layout_det["category_id"] + if category_id in allow_category_id_list: + span = {"bbox": layout_det["bbox"], "score": layout_det["score"]} + if category_id == 3: + span["type"] = ContentType.Image + elif category_id == 5: + span["type"] = ContentType.Table + elif category_id == 13: + span["content"] = layout_det["latex"] + span["type"] = ContentType.InlineEquation + elif category_id == 14: + 
span["content"] = layout_det["latex"] + span["type"] = ContentType.InterlineEquation + elif category_id == 15: + span["content"] = layout_det["text"] + span["type"] = ContentType.Text + all_spans.append(span) + return remove_duplicate_spans(all_spans) + + def get_page_size(self, page_no: int): # 获取页面宽高 + # 获取当前页的page对象 + page = self.__docs[page_no] + # 获取当前页的宽高 + page_w = page.rect.width + page_h = page.rect.height + return page_w, page_h + + def __get_blocks_by_type( + self, type: int, page_no: int, extra_col: list[str] = [] + ) -> list: + blocks = [] + for page_dict in self.__model_list: + layout_dets = page_dict.get("layout_dets", []) + page_info = page_dict.get("page_info", {}) + page_number = page_info.get("page_no", -1) + if page_no != page_number: + continue + for item in layout_dets: + category_id = item.get("category_id", -1) + bbox = item.get("bbox", None) + + if category_id == type: + block = { + "bbox": bbox, + "score": item.get("score"), + } + for col in extra_col: + block[col] = item.get(col, None) + blocks.append(block) + return blocks + + def get_model_list(self, page_no): + return self.__model_list[page_no] + + +if __name__ == "__main__": + drw = DiskReaderWriter(r"D:/project/20231108code-clean") + if 0: + pdf_file_path = r"linshixuqiu\19983-00.pdf" + model_file_path = r"linshixuqiu\19983-00_new.json" + pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN) + model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT) + model_list = json.loads(model_json_txt) + write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00" + img_bucket_path = "imgs" + img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path)) + pdf_docs = fitz.open("pdf", pdf_bytes) + magic_model = MagicModel(model_list, pdf_docs) + + if 1: + model_list = json.loads( + drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json") + ) + pdf_bytes = drw.read( + "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN + ) + pdf_docs = 
class MODEL:
    """Registry of supported analysis backends (magic_pdf/model/model_list.py)."""
    Paddle = "pp_structure_v2"
    PEK = "pdf_extract_kit"


def mfd_model_init(weight):
    """Load the math-formula-detection YOLO model from a weights file."""
    mfd_model = YOLO(weight)
    return mfd_model


def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
    """Build the UniMERNet math-formula-recognition model and its processor.

    :param weight_dir: directory holding pytorch_model.bin plus tokenizer files
    :param cfg_path: path to the UniMERNet yaml demo config
    :param _device_: torch device string the model is moved to
    :return: (model, vis_processor) tuple
    """
    args = argparse.Namespace(cfg_path=cfg_path, options=None)
    cfg = Config(args)
    # Point the config at the local weights/tokenizer instead of the defaults.
    cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
    cfg.config.model.model_config.model_name = weight_dir
    cfg.config.model.tokenizer_config.path = weight_dir
    task = tasks.setup_task(cfg)
    model = task.build_model(cfg)
    model = model.to(_device_)
    vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
    return model, vis_processor


def layout_model_init(weight, config_file, device):
    """Build the LayoutLMv3 layout-detection predictor."""
    model = Layoutlmv3_Predictor(weight, config_file, device)
    return model


class MathDataset(Dataset):
    """Dataset of formula crops; items may be file paths or already-loaded images."""

    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Strings are treated as file paths; anything else is assumed to be
        # an already-loaded (PIL) image.
        if isinstance(self.image_paths[idx], str):
            raw_image = Image.open(self.image_paths[idx])
        else:
            raw_image = self.image_paths[idx]
        # Bug fix: the original returned an unbound local (UnboundLocalError)
        # when no transform was set; fall back to the raw image instead.
        if self.transform:
            return self.transform(raw_image)
        return raw_image
    def __call__(self, image):
        """Run the full analysis pipeline on one page image.

        Pipeline: layout detection -> math-formula detection -> math-formula
        recognition -> (optionally) OCR over text-bearing regions.

        :param image: page raster as an RGB numpy array (it is passed to
            ``Image.fromarray``) — assumed HxWx3 uint8; TODO confirm with caller.
        :return: ``layout_res``, a list of detection dicts carrying
            'category_id', 'poly', 'score' and, per category, 'latex' or 'text'.
        """

        latex_filling_list = []
        mf_image_list = []

        # Layout detection.
        layout_start = time.time()
        layout_res = self.layout_model(image, ignore_catids=[])
        layout_cost = round(time.time() - layout_start, 2)
        logger.info(f"layout detection cost: {layout_cost}")

        # Math-formula detection. category_id = 13 + class index
        # (presumably 13 = inline, 14 = isolated — confirm with the MFD model's
        # class map).
        mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
        for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
            new_item = {
                'category_id': 13 + int(cla.item()),
                'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
                'score': round(float(conf.item()), 2),
                'latex': '',
            }
            layout_res.append(new_item)
            # The same dict object is referenced from both lists, so the MFR
            # stage below writes 'latex' straight into layout_res.
            latex_filling_list.append(new_item)
            bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
            mf_image_list.append(bbox_img)

        # Math-formula recognition: batch the detected crops through the model.
        mfr_start = time.time()
        dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
        dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
        mfr_res = []
        for mf_img in dataloader:
            mf_img = mf_img.to(self.device)
            output = self.mfr_model.generate({'image': mf_img})
            mfr_res.extend(output['pred_str'])
        # Predictions come back in dataset order, matching latex_filling_list.
        for res, latex in zip(latex_filling_list, mfr_res):
            res['latex'] = latex_rm_whitespace(latex)
        mfr_cost = round(time.time() - mfr_start, 2)
        logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")

        # OCR recognition.
        if self.apply_ocr:
            ocr_start = time.time()
            pil_img = Image.fromarray(image)
            # Collect formula boxes (13/14) so the OCR model can mask them out.
            single_page_mfdetrec_res = []
            for res in layout_res:
                if int(res['category_id']) in [13, 14]:
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    single_page_mfdetrec_res.append({
                        "bbox": [xmin, ymin, xmax, ymax],
                    })
            # NOTE(review): this loop appends to layout_res while iterating it;
            # it terminates only because appended items (category 15) never
            # match the filter — consider iterating a snapshot instead.
            for res in layout_res:
                if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # categories that need OCR
                    xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
                    xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
                    crop_box = (xmin, ymin, xmax, ymax)
                    # Paste the crop onto a white page-sized canvas so the OCR
                    # box coordinates stay in page space.
                    cropped_img = Image.new('RGB', pil_img.size, 'white')
                    cropped_img.paste(pil_img.crop(crop_box), crop_box)
                    cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
                    ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
                    if ocr_res:
                        for box_ocr_res in ocr_res:
                            p1, p2, p3, p4 = box_ocr_res[0]
                            text, score = box_ocr_res[1]
                            layout_res.append({
                                'category_id': 15,
                                'poly': p1 + p2 + p3 + p4,
                                'score': round(score, 2),
                                'text': text,
                            })
            ocr_cost = round(time.time() - ocr_start, 2)
            logger.info(f"ocr cost: {ocr_cost}")

        return layout_res
a/magic_pdf/model/pek_sub_modules/__init__.py b/magic_pdf/model/pek_sub_modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..5364f862e78205c65ffe3fdeba6aef09da148c39 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py @@ -0,0 +1,179 @@ +# -------------------------------------------------------------------------------- +# VIT: Multi-Path Vision Transformer for Dense Prediction +# Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI). +# All Rights Reserved. +# Written by Youngwan Lee +# This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the +# LICENSE file in the root directory of this source tree. 
+# -------------------------------------------------------------------------------- +# References: +# timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm +# CoaT: https://github.com/mlpc-ucsd/CoaT +# -------------------------------------------------------------------------------- + + +import torch + +from detectron2.layers import ( + ShapeSpec, +) +from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN +from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool + +from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16 +from .deit import deit_base_patch16, mae_base_patch16 +from .layoutlmft.models.layoutlmv3 import LayoutLMv3Model +from transformers import AutoConfig + +__all__ = [ + "build_vit_fpn_backbone", +] + + +class VIT_Backbone(Backbone): + """ + Implement VIT backbone. + """ + + def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs, + config_path=None, image_only=False, cfg=None): + super().__init__() + self._out_features = out_features + if 'base' in name: + self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32} + self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768} + else: + self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32} + self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024} + + if name == 'beit_base_patch16': + model_func = beit_base_patch16 + elif name == 'dit_base_patch16': + model_func = dit_base_patch16 + elif name == "deit_base_patch16": + model_func = deit_base_patch16 + elif name == "mae_base_patch16": + model_func = mae_base_patch16 + elif name == "dit_large_patch16": + model_func = dit_large_patch16 + elif name == "beit_large_patch16": + model_func = beit_large_patch16 + + if 'beit' in name or 'dit' in name: + if pos_type == "abs": + self.backbone = model_func(img_size=img_size, + 
    def forward(self, x):
        """
        Args:
            x: for layoutlmv3 backbones, a dict with optional keys
               "input_ids", "bbox", "images", "attention_mask";
               otherwise a Tensor of shape (N, C, H, W). H, W must be a
               multiple of ``self.size_divisibility``.

        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        if "layoutlmv3" in self.name:
            return self.backbone.forward(
                input_ids=x["input_ids"] if "input_ids" in x else None,
                bbox=x["bbox"] if "bbox" in x else None,
                images=x["images"] if "images" in x else None,
                attention_mask=x["attention_mask"] if "attention_mask" in x else None,
                # output_hidden_states=True,
            )
        assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
        return self.backbone.forward_features(x)

    def output_shape(self):
        # Channel count and stride per requested output feature, taken from
        # the tables populated in __init__.
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
            )
            for name in self._out_features
        }


def build_VIT_backbone(cfg):
    """
    Create a VIT instance from config.

    Args:
        cfg: a detectron2 CfgNode

    Returns:
        A VIT backbone instance.
    """
    # fmt: off
    name = cfg.MODEL.VIT.NAME
    out_features = cfg.MODEL.VIT.OUT_FEATURES
    drop_path = cfg.MODEL.VIT.DROP_PATH
    img_size = cfg.MODEL.VIT.IMG_SIZE
    pos_type = cfg.MODEL.VIT.POS_TYPE

    # NOTE(review): eval() on a config-supplied string executes arbitrary
    # code; acceptable only because configs are trusted local files — never
    # feed untrusted configs through this path.
    model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", ""))

    if 'layoutlmv3' in name:
        if cfg.MODEL.CONFIG_PATH != '':
            config_path = cfg.MODEL.CONFIG_PATH
        else:
            # Derive the HF config directory from the weights path.
            config_path = cfg.MODEL.WEIGHTS.replace('pytorch_model.bin', '')  # layoutlmv3 pre-trained models
            config_path = config_path.replace('model_final.pth', '')  # detection fine-tuned models
    else:
        config_path = None

    return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs,
                        config_path=config_path, image_only=cfg.MODEL.IMAGE_ONLY, cfg=cfg)
+ """ + bottom_up = build_VIT_backbone(cfg) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..03d4fabdc7816f19a8810e3c443643bc9e53e6b9 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py @@ -0,0 +1,671 @@ +""" Vision Transformer (ViT) in PyTorch + +A PyTorch implement of Vision Transformers as described in +'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' - https://arxiv.org/abs/2010.11929 + +The official jax code is released and available at https://github.com/google-research/vision_transformer + +Status/TODO: +* Models updated to be compatible with official impl. Args added to support backward compat for old PyTorch weights. +* Weights ported from official jax impl for 384x384 base and small models, 16x16 and 32x32 patches. +* Trained (supervised on ImageNet-1k) my custom 'small' patch model to 77.9, 'base' to 79.4 top-1 with this code. +* Hopefully find time and GPUs for SSL or unsupervised pretraining on OpenImages w/ ImageNet fine-tune in future. + +Acknowledgments: +* The paper authors for releasing code and weights, thanks! +* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... 
check it out
for some einops/einsum fun
* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
* Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2020 Ross Wightman
"""
import warnings
import math
import torch
from functools import partial
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import drop_path, to_2tuple, trunc_normal_


def _cfg(url='', **kwargs):
    # Default timm-style pretrained-model config dict; **kwargs overrides any entry.
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Delegates to timm's functional drop_path; no-op in eval mode.
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> Linear -> Dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        # x = self.drop(x)
        # commit this for the original BERT implement
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    """Multi-head self-attention with optional decomposed q/v biases and an
    optional learned relative-position bias table (BEiT style)."""

    def __init__(
            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
            proj_drop=0., window_size=None, attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        # qkv is created without a bias; when qkv_bias is requested, separate q/v bias
        # parameters are concatenated with a frozen zero k-bias at forward time.
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            # +3 extra entries for cls-to-cls, token-to-cls and cls-to-token biases.
            self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(window_size[0])
            coords_w = torch.arange(window_size[1])
            # NOTE(review): torch.meshgrid without indexing= warns on newer torch; verify torch version pin.
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", relative_position_index)

            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size =
None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        """Self-attention over x of shape (B, N, C); optionally adds a shared
        rel_pos_bias and/or this layer's own relative-position bias, interpolating
        the bias table when the runtime window differs from the pre-training one."""
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            # k gets a frozen zero bias so only q and v biases are learned.
            qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
        # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
        qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))

        if self.relative_position_bias_table is not None:
            # NOTE(review): training_window_size is a tensor and self.window_size a tuple;
            # `==` here is elementwise — confirm this branch behaves as intended on the torch version used.
            if training_window_size == self.window_size:
                relative_position_bias = \
                    self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                        self.window_size[0] * self.window_size[1] + 1,
                        self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)
            else:
                training_window_size = tuple(training_window_size.tolist())
                new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
                # new_num_relative_distance covers every possible relative-position entry,
                # including cls-to-cls, token-to-cls and cls-to-token.
                # Bicubically resize the spatial part of the bias table to the new window;
                # the last 3 special (cls) entries are kept as-is and re-appended below.
                new_relative_position_bias_table = F.interpolate(
                    self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                                 2 * self.window_size[0] - 1,
                                                                                 2 * self.window_size[1] - 1),
                    size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                    align_corners=False)
                new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                         new_num_relative_distance - 3).permute(
                    1, 0)
                new_relative_position_bias_table = torch.cat(
                    [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

                # get pair-wise relative position index for each token inside the window
                coords_h = torch.arange(training_window_size[0])
                coords_w = torch.arange(training_window_size[1])
                coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
                coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
                relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
                relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
                relative_coords[:, :, 0] += training_window_size[0] - 1  # shift to start from 0
                relative_coords[:, :, 1] += training_window_size[1] - 1
                relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
                relative_position_index = \
                    torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
                                dtype=relative_coords.dtype)
                relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
                relative_position_index[0, 0:] = new_num_relative_distance - 3
                relative_position_index[0:, 0] = new_num_relative_distance - 2
                relative_position_index[0, 0] = new_num_relative_distance - 1

                relative_position_bias = \
                    new_relative_position_bias_table[relative_position_index.view(-1)].view(
                        training_window_size[0] * training_window_size[1] + 1,
                        training_window_size[0] * training_window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
                relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
                attn = attn + relative_position_bias.unsqueeze(0)

        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Standard pre-norm transformer block (attention + MLP), with optional
    LayerScale (gamma_1/gamma_2) and stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0.,
                 attn_drop=0.,
                 drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 window_size=None, attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
            attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

        # LayerScale: per-channel learnable residual scaling, enabled when init_values is given.
        if init_values is not None:
            self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x, rel_pos_bias=None, training_window_size=None):
        # Pre-norm residual updates; gamma_1/gamma_2 scale the residual branches when LayerScale is on.
        if self.gamma_1 is None:
            x = x + self.drop_path(
                self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias, training_window_size=training_window_size))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias,
                                                            training_window_size=training_window_size))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Projects the image into non-overlapping patch tokens with a strided Conv2d and
    optionally adds a position embedding interpolated to the actual patch grid.
    """

    def __init__(self, img_size=[224, 224], patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)
        num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches_w = self.patch_shape[0]
        self.num_patches_h = self.patch_shape[1]
        # the so-called patch_shape is the patch shape during pre-training
        self.img_size = img_size
        self.patch_size = patch_size
        self.num_patches = num_patches

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x, position_embedding=None, **kwargs):
        """Return (tokens of shape (B, Hp*Wp, C), (Hp, Wp)) for input images x."""
        # FIXME look at relaxing size constraints
        # assert H == self.img_size[0] and W == self.img_size[1], \
        #     f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
        x = self.proj(x)
        Hp, Wp = x.shape[2], x.shape[3]

        if position_embedding is not None:
            # interpolate the position embedding to the corresponding size
            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3,
                                                                                                                  1, 2)
            position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
            x = x + position_embedding

        x = x.flatten(2).transpose(1, 2)
        return x, (Hp, Wp)


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=[224, 224], feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                # Probe the backbone with a dummy input to discover its output size/channels,
                # restoring its original train/eval mode afterwards.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class RelativePositionBias(nn.Module):
    """Shared relative-position bias table (BEiT style), one bias per head and
    per relative offset, with 3 extra entries for cls interactions. Can be
    bicubically resized for a window size different from pre-training."""

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_heads = num_heads
        # +3 extra entries for cls-to-cls, token-to-cls and cls-to-token biases.
        self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(window_size[0])
        coords_w = torch.arange(window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = \
            torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index", relative_position_index)

        # trunc_normal_(self.relative_position_bias_table, std=.02)

    def forward(self, training_window_size):
        """Return the (nH, N, N) bias for the given window size, resizing the
        table on the fly when it differs from the pre-training window."""
        # NOTE(review): tensor-vs-tuple `==` is elementwise here — confirm intended on the torch version used.
        if training_window_size == self.window_size:
            relative_position_bias = \
                self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
        else:
            training_window_size = tuple(training_window_size.tolist())
            new_num_relative_distance = (2 * training_window_size[0] - 1) * (2 * training_window_size[1] - 1) + 3
            # new_num_relative_distance covers every possible relative-position entry,
            # including cls-to-cls, token-to-cls and cls-to-token.
            new_relative_position_bias_table = F.interpolate(
                self.relative_position_bias_table[:-3, :].permute(1, 0).view(1, self.num_heads,
                                                                             2 * self.window_size[0] - 1,
                                                                             2 * self.window_size[1] - 1),
                size=(2 * training_window_size[0] - 1, 2 * training_window_size[1] - 1), mode='bicubic',
                align_corners=False)
            new_relative_position_bias_table = new_relative_position_bias_table.view(self.num_heads,
                                                                                     new_num_relative_distance - 3).permute(
                1, 0)
            new_relative_position_bias_table = torch.cat(
                [new_relative_position_bias_table, self.relative_position_bias_table[-3::]], dim=0)

            # get pair-wise relative position index for each token inside the window
            coords_h = torch.arange(training_window_size[0])
            coords_w = torch.arange(training_window_size[1])
            coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
            relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
            relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += training_window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += training_window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * training_window_size[1] - 1
            relative_position_index = \
                torch.zeros(size=(training_window_size[0] * training_window_size[1] + 1,) * 2,
                            dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
            relative_position_index[0, 0:] = new_num_relative_distance - 3
            relative_position_index[0:, 0] = new_num_relative_distance - 2
            relative_position_index[0, 0] = new_num_relative_distance - 1

            relative_position_bias = \
                new_relative_position_bias_table[relative_position_index.view(-1)].view(
                    training_window_size[0] * training_window_size[1] + 1,
                    training_window_size[0] * training_window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww

        return relative_position_bias


class BEiT(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    Backbone variant: emits a dict of multi-scale feature maps (via fpn1..fpn4)
    named by `out_features`, for use as a detection backbone.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 num_classes=80,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=None,
                 init_values=None,
                 use_abs_pos_emb=False,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 use_checkpoint=True,
                 pretrained=None,
                 out_features=None,
                 ):

        super(BEiT, self).__init__()

        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.use_checkpoint = use_checkpoint

        if hybrid_backbone is not None:
            self.patch_embed = HybridEmbed(
                hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim)
        else:
            self.patch_embed = PatchEmbed(
                img_size=img_size, patch_size=patch_size, in_chans=in_chans,
                embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches
        self.out_features = out_features
        # out_features names look like "layerN"; the numeric suffix selects which blocks to tap.
        self.out_indices = [int(name[5:]) for name in out_features]

        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        if use_abs_pos_emb:
            self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(p=drop_rate)

        self.use_shared_rel_pos_bias = use_shared_rel_pos_bias
        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.ModuleList([
            Block(
                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
                init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
            for i in range(depth)])

        # trunc_normal_(self.mask_token, std=.02)

        # FPN necks: up/down-sample the single-scale ViT output into a feature pyramid
        # (strides 4x/8x/16x/32x relative to the patch grid, depending on patch_size).
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                # nn.SyncBatchNorm(embed_dim),
                nn.BatchNorm2d(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Identity()

            self.fpn3 = nn.Sequential(
                nn.MaxPool2d(kernel_size=2, stride=2),
            )

            self.fpn4 = nn.Sequential(
                nn.MaxPool2d(kernel_size=4, stride=4),
            )

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

    def fix_init_weight(self):
        # Rescale residual-branch projection weights by 1/sqrt(2*layer_id) for training stability.
        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp.fc2.weight.data, layer_id + 1)

    def _init_weights(self, m):
        # Standard truncated-normal init for linears; unit-gain init for LayerNorm.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    '''
    def init_weights(self):
        """Initialize the weights in backbone.

        Args:
            pretrained (str, optional): Path to pre-trained weights.
                Defaults to None.
        """
        logger = get_root_logger()

        if self.pos_embed is not None:
            trunc_normal_(self.pos_embed, std=.02)
        trunc_normal_(self.cls_token, std=.02)
        self.apply(self._init_weights)
        self.fix_init_weight()

        if self.init_cfg is None:
            logger.warn(f'No pre-trained weights for '
                        f'{self.__class__.__name__}, '
                        f'training start from scratch')
        else:
            assert 'checkpoint' in self.init_cfg, f'Only support ' \
                                                  f'specify `Pretrained` in ' \
                                                  f'`init_cfg` in ' \
                                                  f'{self.__class__.__name__} '
            logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}")
            load_checkpoint(self,
                            filename=self.init_cfg['checkpoint'],
                            strict=False,
                            logger=logger,
                            beit_spec_expand_rel_pos = self.use_rel_pos_bias,
                            )
    '''

    def get_num_layers(self):
        return len(self.blocks)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names excluded from weight decay by the optimizer setup.
        return {'pos_embed', 'cls_token'}

    def forward_features(self, x):
        """Run the transformer and return {feature_name: (B, C, H', W') map} for
        each requested out_feature, after the fpn1..fpn4 rescaling necks."""
        B, C, H, W = x.shape
        x, (Hp, Wp) = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
        # Hp, Wp are HW for patches
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        if self.pos_embed is not None:
            cls_tokens = cls_tokens + self.pos_embed[:, :1, :]
        x = torch.cat((cls_tokens, x), dim=1)
        x = self.pos_drop(x)

        features = []
        # Actual patch grid of this input; may differ from the pre-training window.
        training_window_size = torch.tensor([Hp, Wp])

        rel_pos_bias = self.rel_pos_bias(training_window_size) if self.rel_pos_bias is not None else None

        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                # Gradient checkpointing trades compute for activation memory.
                x = checkpoint.checkpoint(blk, x, rel_pos_bias, training_window_size)
            else:
                x = blk(x, rel_pos_bias=rel_pos_bias, training_window_size=training_window_size)
            if i in self.out_indices:
                # Drop the cls token and restore the 2-D patch grid.
                xp = x[:, 1:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
                features.append(xp.contiguous())

        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        feat_out = {}

        for name, value in zip(self.out_features, features):
            feat_out[name] = value

        return feat_out

    def forward(self, x):
        x = self.forward_features(x)
        return x


def beit_base_patch16(pretrained=False, **kwargs):
    """BEiT-Base/16 backbone (no LayerScale)."""
    model = BEiT(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def beit_large_patch16(pretrained=False, **kwargs):
    """BEiT-Large/16 backbone (no LayerScale)."""
    model = BEiT(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=None,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def dit_base_patch16(pretrained=False, **kwargs):
    """DiT-Base/16 backbone (LayerScale init 0.1)."""
    model = BEiT(
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=0.1,
        **kwargs)
    model.default_cfg = _cfg()
    return model


def dit_large_patch16(pretrained=False, **kwargs):
    """DiT-Large/16 backbone (LayerScale init 1e-5)."""
    model =
BEiT(
        patch_size=16,
        embed_dim=1024,
        depth=24,
        num_heads=16,
        mlp_ratio=4,
        qkv_bias=True,
        norm_layer=partial(nn.LayerNorm, eps=1e-6),
        init_values=1e-5,
        **kwargs)
    model.default_cfg = _cfg()
    return model


if __name__ == '__main__':
    # Smoke test: variable input sizes exercise the relative-position-bias interpolation path.
    model = BEiT(use_checkpoint=True, use_shared_rel_pos_bias=True)
    model = model.to("cuda:0")
    input1 = torch.rand(2, 3, 512, 762).to("cuda:0")
    input2 = torch.rand(2, 3, 800, 1200).to("cuda:0")
    input3 = torch.rand(2, 3, 720, 1000).to("cuda:0")
    output1 = model(input1)
    output2 = model(input2)
    output3 = model(input3)
    print("all done")
diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a13bb0a8514df29fb4b0ec58c3726ba9c221a8a
--- /dev/null
+++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py
@@ -0,0 +1,476 @@
"""
Mostly copy-paste from DINO and timm library:
https://github.com/facebookresearch/dino
https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
"""
import warnings

import math
import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint
from timm.models.layers import trunc_normal_, drop_path, to_2tuple
from functools import partial


def _cfg(url='', **kwargs):
    # Default timm-style pretrained-model config dict; **kwargs overrides any entry.
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
        **kwargs
    }


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Delegates to timm's functional drop_path; no-op in eval mode.
        return drop_path(x, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return 'p={}'.format(self.drop_prob)


class Mlp(nn.Module):
    """Transformer feed-forward block: Linear -> activation -> Dropout -> Linear -> Dropout."""

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Attention(nn.Module):
    """Plain multi-head self-attention (timm/DeiT style, no relative position bias)."""

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        # Single projection produces q, k, v stacked along dim 0 after the permute.
        q, k, v = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                      C // self.num_heads).permute(2, 0, 3, 1, 4)

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Block(nn.Module):
    """Standard pre-norm transformer block (attention + MLP) with stochastic depth."""

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads,
            qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(
            drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim,
                       act_layer=act_layer, drop=drop)

    def forward(self, x):
        # Pre-norm residual updates for attention and MLP sub-blocks.
        x = x + self.drop_path(self.attn(self.norm1(x)))
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class PatchEmbed(nn.Module):
    """ Image to Patch Embedding

    Strided Conv2d patchifier; unlike the beit.py variant this forward returns
    the raw (B, C, Hp, Wp) feature map without flattening.
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        img_size = to_2tuple(img_size)
        patch_size = to_2tuple(patch_size)

        self.window_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])

        self.num_patches_w, self.num_patches_h = self.window_size

        self.num_patches = self.window_size[0] * self.window_size[1]
        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2d(in_chans, embed_dim,
                              kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        x = self.proj(x)
        return x


class HybridEmbed(nn.Module):
    """ CNN Feature Map Embedding
    Extract feature map from CNN, flatten, project to embedding dim.
    """

    def __init__(self, backbone, img_size=224, feature_size=None, in_chans=3, embed_dim=768):
        super().__init__()
        assert isinstance(backbone, nn.Module)
        img_size = to_2tuple(img_size)
        self.img_size = img_size
        self.backbone = backbone
        if feature_size is None:
            with torch.no_grad():
                # FIXME this is hacky, but most reliable way of determining the exact dim of the output feature
                # map for all networks, the feature metadata has reliable channel and stride info, but using
                # stride to calc feature dim requires info about padding of each stage that isn't captured.
                # Probe the backbone with a dummy input, restoring its train/eval mode afterwards.
                training = backbone.training
                if training:
                    backbone.eval()
                o = self.backbone(torch.zeros(
                    1, in_chans, img_size[0], img_size[1]))[-1]
                feature_size = o.shape[-2:]
                feature_dim = o.shape[1]
                backbone.train(training)
        else:
            feature_size = to_2tuple(feature_size)
            feature_dim = self.backbone.feature_info.channels()[-1]
        self.num_patches = feature_size[0] * feature_size[1]
        self.proj = nn.Linear(feature_dim, embed_dim)

    def forward(self, x):
        x = self.backbone(x)[-1]
        x = x.flatten(2).transpose(1, 2)
        x = self.proj(x)
        return x


class ViT(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage

    DeiT/DINO-style backbone: absolute position embeddings (interpolated to the
    input size), optional distillation token, multi-scale outputs via fpn1..fpn4.
    """

    def __init__(self,
                 model_name='vit_base_patch16_224',
                 img_size=384,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=1024,
                 depth=24,
                 num_heads=16,
                 num_classes=19,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.1,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 hybrid_backbone=None,
                 norm_layer=partial(nn.LayerNorm, eps=1e-6),
                 norm_cfg=None,
                 pos_embed_interp=False,
                 random_init=False,
                 align_corners=False,
                 use_checkpoint=False,
                 num_extra_tokens=1,
                 out_features=None,
                 **kwargs,
                 ):

        super(ViT, self).__init__()
        self.model_name = model_name
        self.img_size = img_size
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.depth = depth
        self.num_heads = num_heads
        self.num_classes = num_classes
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.qk_scale = qk_scale
        self.drop_rate = drop_rate
        self.attn_drop_rate = attn_drop_rate
        self.drop_path_rate = drop_path_rate
        self.hybrid_backbone = hybrid_backbone
        self.norm_layer = norm_layer
        self.norm_cfg = norm_cfg
        self.pos_embed_interp = pos_embed_interp
        self.random_init = random_init
        self.align_corners = align_corners
        self.use_checkpoint = use_checkpoint
        # 1 = cls token only; 2 = cls + distillation token (DeiT).
        self.num_extra_tokens = num_extra_tokens
        self.out_features = out_features
        # out_features names look like "layerN"; the numeric suffix selects which blocks to tap.
        self.out_indices = [int(name[5:])
for name in out_features] + + # self.num_stages = self.depth + # self.out_indices = tuple(range(self.num_stages)) + + if self.hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + self.hybrid_backbone, img_size=self.img_size, in_chans=self.in_chans, embed_dim=self.embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=self.img_size, patch_size=self.patch_size, in_chans=self.in_chans, embed_dim=self.embed_dim) + self.num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + + if self.num_extra_tokens == 2: + self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + + self.pos_embed = nn.Parameter(torch.zeros( + 1, self.num_patches + self.num_extra_tokens, self.embed_dim)) + self.pos_drop = nn.Dropout(p=self.drop_rate) + + # self.num_extra_tokens = self.pos_embed.shape[-2] - self.num_patches + dpr = [x.item() for x in torch.linspace(0, self.drop_path_rate, + self.depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=self.embed_dim, num_heads=self.num_heads, mlp_ratio=self.mlp_ratio, qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + drop=self.drop_rate, attn_drop=self.attn_drop_rate, drop_path=dpr[i], norm_layer=self.norm_layer) + for i in range(self.depth)]) + + # NOTE as per official impl, we could have a pre-logits representation dense layer + tanh here + # self.repr = nn.Linear(embed_dim, representation_size) + # self.repr_act = nn.Tanh() + + if patch_size == 16: + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + nn.SyncBatchNorm(embed_dim), + nn.GELU(), + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), + ) + + self.fpn3 = nn.Identity() + + self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) + elif patch_size == 8: + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(embed_dim, 
embed_dim, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Identity() + + self.fpn3 = nn.Sequential( + nn.MaxPool2d(kernel_size=2, stride=2), + ) + + self.fpn4 = nn.Sequential( + nn.MaxPool2d(kernel_size=4, stride=4), + ) + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + if self.num_extra_tokens==2: + trunc_normal_(self.dist_token, std=0.2) + self.apply(self._init_weights) + # self.fix_init_weight() + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + ''' + def init_weights(self): + logger = get_root_logger() + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + if self.init_cfg is None: + logger.warn(f'No pre-trained weights for ' + f'{self.__class__.__name__}, ' + f'training start from scratch') + else: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + logger.info(f"Will load ckpt from {self.init_cfg['checkpoint']}") + load_checkpoint(self, filename=self.init_cfg['checkpoint'], strict=False, logger=logger) + ''' + + def get_num_layers(self): + return len(self.blocks) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + def _conv_filter(self, state_dict, patch_size=16): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k: + v = 
    def interpolate_pos_encoding(self, x, w, h):
        """Return position embeddings resized to the current token grid.

        Args:
            x: token sequence of shape (B, num_extra_tokens + npatch, dim);
               only its token count and embedding dim are read.
            w, h: input image width and height in pixels.

        Returns:
            Tensor of shape (1, num_extra_tokens + npatch, dim): the stored
            ``pos_embed`` with its patch part bicubically interpolated to the
            (w // patch_w) x (h // patch_h) grid; the cls/dist embeddings are
            passed through unchanged.

        NOTE(review): the reshape below assumes the pretrained patch grid is
        square (N is a perfect square) — confirm for non-square checkpoints.
        """
        npatch = x.shape[1] - self.num_extra_tokens
        N = self.pos_embed.shape[1] - self.num_extra_tokens
        # Fast path: token count already matches and the image is square.
        if npatch == N and w == h:
            return self.pos_embed

        # Split off the cls (and optional distillation) token embeddings.
        class_ORdist_pos_embed = self.pos_embed[:, 0:self.num_extra_tokens]

        patch_pos_embed = self.pos_embed[:, self.num_extra_tokens:]

        dim = x.shape[-1]
        # Target patch grid implied by the input resolution.
        w0 = w // self.patch_embed.patch_size[0]
        h0 = h // self.patch_embed.patch_size[1]
        # we add a small number to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        w0, h0 = w0 + 0.1, h0 + 0.1
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2),
            scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)),
            mode='bicubic',
        )
        # Interpolation must land exactly on the integer target grid.
        assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1]
        # Back to (1, npatch, dim) token layout.
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_ORdist_pos_embed, patch_pos_embed), dim=1)
    def forward_features(self, x):
        """Run the transformer and return multi-scale FPN feature maps.

        Args:
            x: image batch of shape (B, C, H, W); H and W must be divisible
               by ``self.patch_size``.

        Returns:
            Dict mapping each name in ``self.out_features`` to a feature map
            of shape (B, embed_dim, Hp*, Wp*): the blocks selected by
            ``self.out_indices`` are reshaped back onto the patch grid and
            rescaled by fpn1..fpn4.
        """
        # print(f"==========shape of x is {x.shape}==========")
        B, _, H, W = x.shape
        # Patch-grid resolution used to fold token sequences back to 2-D maps.
        Hp, Wp = H // self.patch_size, W // self.patch_size
        x = self.prepare_tokens(x)

        features = []
        for i, blk in enumerate(self.blocks):
            if self.use_checkpoint:
                # Trade compute for memory: recompute activations on backward.
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
            if i in self.out_indices:
                # Drop cls/dist tokens, then (B, L, C) -> (B, C, Hp, Wp).
                xp = x[:, self.num_extra_tokens:, :].permute(0, 2, 1).reshape(B, -1, Hp, Wp)
                features.append(xp.contiguous())

        # One FPN head per collected map, in out_indices order.
        # NOTE(review): assumes out_indices selects at most four blocks — confirm.
        ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
        for i in range(len(features)):
            features[i] = ops[i](features[i])

        feat_out = {}

        for name, value in zip(self.out_features, features):
            feat_out[name] = value

        return feat_out
def quad_to_box(quad):
    """Convert a CORD quadrilateral annotation to an (x0, y0, x1, y1) tuple.

    Uses the top-left (x1, y1) and bottom-right (x3, y3) corners of the quad,
    clamping the top-left corner to be non-negative.  Coordinates arriving in
    reversed order are swapped (sample "test 87" is wrongly annotated).
    """
    x0, y0 = max(0, quad["x1"]), max(0, quad["y1"])
    x1, y1 = quad["x3"], quad["y3"]
    if y1 < y0:
        y0, y1 = y1, y0
    if x1 < x0:
        x0, x1 = x1, x0
    return (x0, y0, x1, y1)
class CordConfig(datasets.BuilderConfig):
    """BuilderConfig for the CORD receipt-understanding dataset."""

    def __init__(self, **kwargs):
        """BuilderConfig for CORD.

        Args:
            **kwargs: keyword arguments forwarded to ``datasets.BuilderConfig``.
        """
        super().__init__(**kwargs)
names=["O","B-MENU.NM","B-MENU.NUM","B-MENU.UNITPRICE","B-MENU.CNT","B-MENU.DISCOUNTPRICE","B-MENU.PRICE","B-MENU.ITEMSUBTOTAL","B-MENU.VATYN","B-MENU.ETC","B-MENU.SUB_NM","B-MENU.SUB_UNITPRICE","B-MENU.SUB_CNT","B-MENU.SUB_PRICE","B-MENU.SUB_ETC","B-VOID_MENU.NM","B-VOID_MENU.PRICE","B-SUB_TOTAL.SUBTOTAL_PRICE","B-SUB_TOTAL.DISCOUNT_PRICE","B-SUB_TOTAL.SERVICE_PRICE","B-SUB_TOTAL.OTHERSVC_PRICE","B-SUB_TOTAL.TAX_PRICE","B-SUB_TOTAL.ETC","B-TOTAL.TOTAL_PRICE","B-TOTAL.TOTAL_ETC","B-TOTAL.CASHPRICE","B-TOTAL.CHANGEPRICE","B-TOTAL.CREDITCARDPRICE","B-TOTAL.EMONEYPRICE","B-TOTAL.MENUTYPE_CNT","B-TOTAL.MENUQTY_CNT","I-MENU.NM","I-MENU.NUM","I-MENU.UNITPRICE","I-MENU.CNT","I-MENU.DISCOUNTPRICE","I-MENU.PRICE","I-MENU.ITEMSUBTOTAL","I-MENU.VATYN","I-MENU.ETC","I-MENU.SUB_NM","I-MENU.SUB_UNITPRICE","I-MENU.SUB_CNT","I-MENU.SUB_PRICE","I-MENU.SUB_ETC","I-VOID_MENU.NM","I-VOID_MENU.PRICE","I-SUB_TOTAL.SUBTOTAL_PRICE","I-SUB_TOTAL.DISCOUNT_PRICE","I-SUB_TOTAL.SERVICE_PRICE","I-SUB_TOTAL.OTHERSVC_PRICE","I-SUB_TOTAL.TAX_PRICE","I-SUB_TOTAL.ETC","I-TOTAL.TOTAL_PRICE","I-TOTAL.TOTAL_ETC","I-TOTAL.CASHPRICE","I-TOTAL.CHANGEPRICE","I-TOTAL.CREDITCARDPRICE","I-TOTAL.EMONEYPRICE","I-TOTAL.MENUTYPE_CNT","I-TOTAL.MENUQTY_CNT"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + citation=_CITATION, + homepage="https://github.com/clovaai/cord/", + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + """Uses local files located with data_dir""" + downloaded_file = dl_manager.download_and_extract(_URLS) + # move files from the second URL together with files from the first one. 
+ dest = Path(downloaded_file[0])/"CORD" + for split in ["train", "dev", "test"]: + for file_type in ["image", "json"]: + if split == "test" and file_type == "json": + continue + files = (Path(downloaded_file[1])/"CORD"/split/file_type).iterdir() + for f in files: + os.rename(f, dest/split/file_type/f.name) + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": dest/"train"} + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, gen_kwargs={"filepath": dest/"dev"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": dest/"test"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] + + x0, y0, x1, y1 = min(x), min(y), max(x), max(y) + + assert x1 >= x0 and y1 >= y0 + bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] + return bbox + + def _generate_examples(self, filepath): + logger.info("⏳ Generating examples from = %s", filepath) + ann_dir = os.path.join(filepath, "json") + img_dir = os.path.join(filepath, "image") + for guid, file in enumerate(sorted(os.listdir(ann_dir))): + words = [] + bboxes = [] + ner_tags = [] + file_path = os.path.join(ann_dir, file) + with open(file_path, "r", encoding="utf8") as f: + data = json.load(f) + image_path = os.path.join(img_dir, file) + image_path = image_path.replace("json", "png") + image, size = load_image(image_path) + for item in data["valid_line"]: + cur_line_bboxes = [] + line_words, label = item["words"], item["category"] + line_words = [w for w in line_words if w["text"].strip() != ""] + if len(line_words) == 0: + continue + if label == "other": + for w in line_words: + words.append(w["text"]) + ner_tags.append("O") + cur_line_bboxes.append(normalize_bbox(quad_to_box(w["quad"]), size)) + else: + words.append(line_words[0]["text"]) + ner_tags.append("B-" + label.upper()) + 
def pre_calc_rel_mat(segment_ids):
    """Build a per-batch boolean "same segment" relation matrix.

    Args:
        segment_ids: integer tensor of shape (B, L) assigning each token to a
            segment id.

    Returns:
        BoolTensor ``valid_span`` of shape (B, L, L) where
        ``valid_span[b, i, j]`` is True iff tokens *i* and *j* of batch
        element *b* carry the same segment id.
    """
    # Vectorized equivalent of the original per-row Python loop that filled
    # valid_span[b, i, :] = (segment_ids[b, :] == segment_ids[b, i]); one
    # broadcasted comparison replaces B*L tensor ops and keeps device/dtype.
    return segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)
+ Args: + tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`): + The tokenizer used for encoding the data. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding index) + among: + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.5 (Volta). + label_pad_token_id (:obj:`int`, `optional`, defaults to -100): + The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). 
    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Pads token-level fields with the tokenizer, pads the ragged
        ``labels``/``bbox``/``position_ids`` lists by hand, stacks images and
        widens the attention mask / labels with visual-token slots, and turns
        ``segment_ids`` into a ``valid_span`` relation matrix.
        """
        # HF datasets may store targets under either "label" or "labels".
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None

        images = None
        if "images" in features[0]:
            images = torch.stack([torch.tensor(d.pop("images")) for d in features])
            # Visual token count: one per 16x16 patch plus one extra slot.
            # NOTE(review): assumes square images and 16-px patches — confirm.
            IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
            return_tensors="pt" if labels is None else None,
        )

        if images is not None:
            batch["images"] = images
            # attention_mask may still be a list of lists here; tensorize only it.
            batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v
                     for k, v in batch.items()}
            # Visual tokens are always attended to.
            visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long)
            batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1)

        if labels is None:
            return batch

        has_bbox_input = "bbox" in features[0]
        has_position_input = "position_ids" in features[0]
        padding_idx=self.tokenizer.pad_token_id
        sequence_length = torch.tensor(batch["input_ids"]).shape[1]
        padding_side = self.tokenizer.padding_side
        # Pad the ragged per-token fields out to sequence_length on the same
        # side the tokenizer pads on.
        if padding_side == "right":
            batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
            if has_bbox_input:
                batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
            if has_position_input:
                batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id))
                                         for position_id in batch["position_ids"]]

        else:
            batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
            if has_bbox_input:
                batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
            if has_position_input:
                batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id))
                                         + position_id for position_id in batch["position_ids"]]

        if 'segment_ids' in batch:
            assert 'position_ids' in batch
            # Extend segment ids: text padding gets a fresh segment id and the
            # visual tokens yet another, so neither matches real text segments.
            # NOTE(review): IMAGE_LEN is only bound when "images" was present;
            # segment_ids without images would raise NameError here — confirm
            # callers always supply images alongside segment_ids.
            for i in range(len(batch['segment_ids'])):
                batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [
                    batch['segment_ids'][i][-1] + 2] * IMAGE_LEN

        batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}

        if 'segment_ids' in batch:
            valid_span = pre_calc_rel_mat(
                segment_ids=batch['segment_ids']
            )
            batch['valid_span'] = valid_span
            del batch['segment_ids']

        if images is not None:
            # Visual positions carry no supervision: -100 is ignored by the loss.
            visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100
            batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1)

        return batch
Thiran}, + journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, + year={2019}, + volume={2}, + pages={1-6} +} +""" + +_DESCRIPTION = """\ +https://guillaumejaume.github.io/FUNSD/ +""" + + +class FunsdConfig(datasets.BuilderConfig): + """BuilderConfig for FUNSD""" + + def __init__(self, **kwargs): + """BuilderConfig for FUNSD. + + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(FunsdConfig, self).__init__(**kwargs) + + +class Funsd(datasets.GeneratorBasedBuilder): + """Conll2003 dataset.""" + + BUILDER_CONFIGS = [ + FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), + ] + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features( + { + "id": datasets.Value("string"), + "tokens": datasets.Sequence(datasets.Value("string")), + "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), + "ner_tags": datasets.Sequence( + datasets.features.ClassLabel( + names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] + ) + ), + "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), + "image_path": datasets.Value("string"), + } + ), + supervised_keys=None, + homepage="https://guillaumejaume.github.io/FUNSD/", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} + ), + ] + + def get_line_bbox(self, bboxs): + x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] + y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, 
    def _generate_examples(self, filepath):
        """Yield ``(guid, example)`` pairs from one FUNSD split directory.

        Reads every JSON annotation under ``<filepath>/annotations``, pairs
        it with the PNG of the same stem under ``<filepath>/images``, and
        emits word tokens, BIO ``ner_tags`` and line-union bounding boxes
        normalized to the 0-1000 grid.
        """
        logger.info("⏳ Generating examples from = %s", filepath)
        ann_dir = os.path.join(filepath, "annotations")
        img_dir = os.path.join(filepath, "images")
        for guid, file in enumerate(sorted(os.listdir(ann_dir))):
            tokens = []
            bboxes = []
            ner_tags = []

            file_path = os.path.join(ann_dir, file)
            with open(file_path, "r", encoding="utf8") as f:
                data = json.load(f)
            image_path = os.path.join(img_dir, file)
            # NOTE(review): replaces every "json" substring in the path, not
            # just the extension — a stem containing "json" would break this.
            image_path = image_path.replace("json", "png")
            image, size = load_image(image_path)
            for item in data["form"]:
                cur_line_bboxes = []
                words, label = item["words"], item["label"]
                # Drop words that are empty after stripping whitespace.
                words = [w for w in words if w["text"].strip() != ""]
                if len(words) == 0:
                    continue
                if label == "other":
                    # "other" spans carry no entity: tag every word O.
                    for w in words:
                        tokens.append(w["text"])
                        ner_tags.append("O")
                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
                else:
                    # BIO scheme: first word opens the entity, the rest continue it.
                    tokens.append(words[0]["text"])
                    ner_tags.append("B-" + label.upper())
                    cur_line_bboxes.append(normalize_bbox(words[0]["box"], size))
                    for w in words[1:]:
                        tokens.append(w["text"])
                        ner_tags.append("I-" + label.upper())
                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
                # by default: --segment_level_layout 1
                # if do not want to use segment_level_layout, comment the following line
                cur_line_bboxes = self.get_line_bbox(cur_line_bboxes)
                # box = normalize_bbox(item["box"], size)
                # cur_line_bboxes = [box for _ in range(len(words))]
                bboxes.extend(cur_line_bboxes)
            yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
                         "image": image, "image_path": image_path}
def normalize_bbox(bbox, size):
    """Rescale an (x0, y0, x1, y1) pixel box onto the 0-1000 coordinate grid.

    ``size`` is the page (width, height); x coordinates are divided by the
    width and y coordinates by the height, then truncated to int.
    """
    width, height = size
    return [int(1000 * coord / extent)
            for coord, extent in zip(bbox, (width, height, width, height))]
def clamp(num, min_value, max_value):
    """Clamp *num* into the closed interval [min_value, max_value].

    When the bounds are inverted the lower bound wins, matching
    ``max(min(num, max_value), min_value)``.
    """
    if num > max_value:
        num = max_value
    return min_value if num < min_value else num
class Compose:
    """Chain several transforms together; not torchscript-compatible.

    Each transform must accept ``(img, augmentation, box)`` and return the
    (possibly replaced) image, which is threaded through the chain.

    Args:
        transforms (list of ``Transform`` objects): transforms to compose.

    Example:
        >>> Compose([
        >>>     transforms.CenterCrop(10),
        >>>     transforms.PILToTensor(),
        >>> ])

    .. note::
        To script transformations, wrap scriptable (tensor-only) transforms
        in ``torch.nn.Sequential`` instead; ``lambda`` functions and
        ``PIL.Image`` inputs cannot be scripted.
    """

    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, augmentation=False, box=None):
        result = img
        for transform in self.transforms:
            result = transform(result, augmentation, box)
        return result
/ 3.), + interpolation='bilinear', second_interpolation='lanczos'): + if isinstance(size, tuple): + self.size = size + else: + self.size = (size, size) + if second_size is not None: + if isinstance(second_size, tuple): + self.second_size = second_size + else: + self.second_size = (second_size, second_size) + else: + self.second_size = None + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + self.interpolation = _pil_interp(interpolation) + self.second_interpolation = _pil_interp(second_interpolation) + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. + """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img, augmentation=False, box=None): + """ + Args: + img (PIL Image): Image to be cropped and resized. 
+ Returns: + PIL Image: Randomly cropped and resized image. + """ + if augmentation: + i, j, h, w = self.get_params(img, self.scale, self.ratio) + img = F.crop(img, i, j, h, w) + # img, box = crop(img, i, j, h, w, box) + img = F.resize(img, self.size, self.interpolation) + second_img = F.resize(img, self.second_size, self.second_interpolation) \ + if self.second_size is not None else None + return img, second_img + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation]) + else: + interpolate_str = _pil_interpolation_to_str[self.interpolation] + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0}'.format(interpolate_str) + if self.second_size is not None: + format_string += ', second_size={0}'.format(self.second_size) + format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation]) + format_string += ')' + return format_string + + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py new file mode 100644 index 0000000000000000000000000000000000000000..7749ba5dd1d59a4e0c5baf4f2c27cffaae3e4e12 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py @@ -0,0 +1,213 @@ +import os +import json + +import torch +from torch.utils.data.dataset import Dataset +from torchvision import transforms +from PIL import Image + +from .image_utils import Compose, 
RandomResizedCropAndInterpolationWithTwoPic + +XFund_label2ids = { + "O":0, + 'B-HEADER':1, + 'I-HEADER':2, + 'B-QUESTION':3, + 'I-QUESTION':4, + 'B-ANSWER':5, + 'I-ANSWER':6, +} + +class xfund_dataset(Dataset): + def box_norm(self, box, width, height): + def clip(min_num, num, max_num): + return min(max(num, min_num), max_num) + + x0, y0, x1, y1 = box + x0 = clip(0, int((x0 / width) * 1000), 1000) + y0 = clip(0, int((y0 / height) * 1000), 1000) + x1 = clip(0, int((x1 / width) * 1000), 1000) + y1 = clip(0, int((y1 / height) * 1000), 1000) + assert x1 >= x0 + assert y1 >= y0 + return [x0, y0, x1, y1] + + def get_segment_ids(self, bboxs): + segment_ids = [] + for i in range(len(bboxs)): + if i == 0: + segment_ids.append(0) + else: + if bboxs[i - 1] == bboxs[i]: + segment_ids.append(segment_ids[-1]) + else: + segment_ids.append(segment_ids[-1] + 1) + return segment_ids + + def get_position_ids(self, segment_ids): + position_ids = [] + for i in range(len(segment_ids)): + if i == 0: + position_ids.append(2) + else: + if segment_ids[i] == segment_ids[i - 1]: + position_ids.append(position_ids[-1] + 1) + else: + position_ids.append(2) + return position_ids + + def load_data( + self, + data_file, + ): + # re-org data format + total_data = {"id": [], "lines": [], "bboxes": [], "ner_tags": [], "image_path": []} + for i in range(len(data_file['documents'])): + width, height = data_file['documents'][i]['img']['width'], data_file['documents'][i]['img'][ + 'height'] + + cur_doc_lines, cur_doc_bboxes, cur_doc_ner_tags, cur_doc_image_path = [], [], [], [] + for j in range(len(data_file['documents'][i]['document'])): + cur_item = data_file['documents'][i]['document'][j] + cur_doc_lines.append(cur_item['text']) + cur_doc_bboxes.append(self.box_norm(cur_item['box'], width=width, height=height)) + cur_doc_ner_tags.append(cur_item['label']) + total_data['id'] += [len(total_data['id'])] + total_data['lines'] += [cur_doc_lines] + total_data['bboxes'] += [cur_doc_bboxes] + 
total_data['ner_tags'] += [cur_doc_ner_tags] + total_data['image_path'] += [data_file['documents'][i]['img']['fname']] + + # tokenize text and get bbox/label + total_input_ids, total_bboxs, total_label_ids = [], [], [] + for i in range(len(total_data['lines'])): + cur_doc_input_ids, cur_doc_bboxs, cur_doc_labels = [], [], [] + for j in range(len(total_data['lines'][i])): + cur_input_ids = self.tokenizer(total_data['lines'][i][j], truncation=False, add_special_tokens=False, return_attention_mask=False)['input_ids'] + if len(cur_input_ids) == 0: continue + + cur_label = total_data['ner_tags'][i][j].upper() + if cur_label == 'OTHER': + cur_labels = ["O"] * len(cur_input_ids) + for k in range(len(cur_labels)): + cur_labels[k] = self.label2ids[cur_labels[k]] + else: + cur_labels = [cur_label] * len(cur_input_ids) + cur_labels[0] = self.label2ids['B-' + cur_labels[0]] + for k in range(1, len(cur_labels)): + cur_labels[k] = self.label2ids['I-' + cur_labels[k]] + assert len(cur_input_ids) == len([total_data['bboxes'][i][j]] * len(cur_input_ids)) == len(cur_labels) + cur_doc_input_ids += cur_input_ids + cur_doc_bboxs += [total_data['bboxes'][i][j]] * len(cur_input_ids) + cur_doc_labels += cur_labels + assert len(cur_doc_input_ids) == len(cur_doc_bboxs) == len(cur_doc_labels) + assert len(cur_doc_input_ids) > 0 + + total_input_ids.append(cur_doc_input_ids) + total_bboxs.append(cur_doc_bboxs) + total_label_ids.append(cur_doc_labels) + assert len(total_input_ids) == len(total_bboxs) == len(total_label_ids) + + # split text to several slices because of over-length + input_ids, bboxs, labels = [], [], [] + segment_ids, position_ids = [], [] + image_path = [] + for i in range(len(total_input_ids)): + start = 0 + cur_iter = 0 + while start < len(total_input_ids[i]): + end = min(start + 510, len(total_input_ids[i])) + + input_ids.append([self.tokenizer.cls_token_id] + total_input_ids[i][start: end] + [self.tokenizer.sep_token_id]) + bboxs.append([[0, 0, 0, 0]] + 
total_bboxs[i][start: end] + [[1000, 1000, 1000, 1000]]) + labels.append([-100] + total_label_ids[i][start: end] + [-100]) + + cur_segment_ids = self.get_segment_ids(bboxs[-1]) + cur_position_ids = self.get_position_ids(cur_segment_ids) + segment_ids.append(cur_segment_ids) + position_ids.append(cur_position_ids) + image_path.append(os.path.join(self.args.data_dir, "images", total_data['image_path'][i])) + + start = end + cur_iter += 1 + + assert len(input_ids) == len(bboxs) == len(labels) == len(segment_ids) == len(position_ids) + assert len(segment_ids) == len(image_path) + + res = { + 'input_ids': input_ids, + 'bbox': bboxs, + 'labels': labels, + 'segment_ids': segment_ids, + 'position_ids': position_ids, + 'image_path': image_path, + } + return res + + def __init__( + self, + args, + tokenizer, + mode + ): + self.args = args + self.mode = mode + self.cur_la = args.language + self.tokenizer = tokenizer + self.label2ids = XFund_label2ids + + + self.common_transform = Compose([ + RandomResizedCropAndInterpolationWithTwoPic( + size=args.input_size, interpolation=args.train_interpolation, + ), + ]) + + self.patch_transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor((0.5, 0.5, 0.5)), + std=torch.tensor((0.5, 0.5, 0.5))) + ]) + + data_file = json.load( + open(os.path.join(args.data_dir, "{}.{}.json".format(self.cur_la, 'train' if mode == 'train' else 'val')), + 'r')) + + self.feature = self.load_data(data_file) + + def __len__(self): + return len(self.feature['input_ids']) + + def __getitem__(self, index): + input_ids = self.feature["input_ids"][index] + + # attention_mask = self.feature["attention_mask"][index] + attention_mask = [1] * len(input_ids) + labels = self.feature["labels"][index] + bbox = self.feature["bbox"][index] + segment_ids = self.feature['segment_ids'][index] + position_ids = self.feature['position_ids'][index] + + img = pil_loader(self.feature['image_path'][index]) + for_patches, _ = 
self.common_transform(img, augmentation=False) + patch = self.patch_transform(for_patches) + + assert len(input_ids) == len(attention_mask) == len(labels) == len(bbox) == len(segment_ids) + + res = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "labels": labels, + "bbox": bbox, + "segment_ids": segment_ids, + "position_ids": position_ids, + "images": patch, + } + return res + +def pil_loader(path: str) -> Image.Image: + # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835) + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') \ No newline at end of file diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b3100effb34547bbaba7503288db34374cad9ca --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py @@ -0,0 +1,7 @@ +from .layoutlmv3 import ( + LayoutLMv3Config, + LayoutLMv3ForTokenClassification, + LayoutLMv3ForQuestionAnswering, + LayoutLMv3ForSequenceClassification, + LayoutLMv3Tokenizer, +) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e06a24b0ca9971cfe99dc9ef60ce8e495ff406bd --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py @@ -0,0 +1,24 @@ +from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, \ + AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer +from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter + +from .configuration_layoutlmv3 import LayoutLMv3Config +from .modeling_layoutlmv3 import ( + 
LayoutLMv3ForTokenClassification, + LayoutLMv3ForQuestionAnswering, + LayoutLMv3ForSequenceClassification, + LayoutLMv3Model, +) +from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer +from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast + + +#AutoConfig.register("layoutlmv3", LayoutLMv3Config) +#AutoModel.register(LayoutLMv3Config, LayoutLMv3Model) +#AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification) +#AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering) +#AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification) +#AutoTokenizer.register( +# LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast +#) +SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter}) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..d2c7b4d71b4d51504dee8bc10e50ea91bac00270 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py @@ -0,0 +1,60 @@ +# coding=utf-8 +from transformers.models.bert.configuration_bert import BertConfig +from transformers.utils import logging + + +logger = logging.get_logger(__name__) + +LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json", + "layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/resolve/main/config.json", + # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3 +} + + +class LayoutLMv3Config(BertConfig): + model_type = "layoutlmv3" + + def __init__( + self, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + 
max_2d_position_embeddings=1024, + coordinate_size=None, + shape_size=None, + has_relative_attention_bias=False, + rel_pos_bins=32, + max_rel_pos=128, + has_spatial_attention_bias=False, + rel_2d_pos_bins=64, + max_rel_2d_pos=256, + visual_embed=True, + mim=False, + wpa_task=False, + discrete_vae_weight_path='', + discrete_vae_type='dall-e', + input_size=224, + second_input_size=112, + device='cuda', + **kwargs + ): + """Constructs RobertaConfig.""" + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + self.max_2d_position_embeddings = max_2d_position_embeddings + self.coordinate_size = coordinate_size + self.shape_size = shape_size + self.has_relative_attention_bias = has_relative_attention_bias + self.rel_pos_bins = rel_pos_bins + self.max_rel_pos = max_rel_pos + self.has_spatial_attention_bias = has_spatial_attention_bias + self.rel_2d_pos_bins = rel_2d_pos_bins + self.max_rel_2d_pos = max_rel_2d_pos + self.visual_embed = visual_embed + self.mim = mim + self.wpa_task = wpa_task + self.discrete_vae_weight_path = discrete_vae_weight_path + self.discrete_vae_type = discrete_vae_type + self.input_size = input_size + self.second_input_size = second_input_size + self.device = device diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..113eb8eb1d123a4985c1894e0caab561b19f64c2 --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py @@ -0,0 +1,1282 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LayoutLMv3 model. """ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from transformers import apply_chunking_to_forward +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + TokenClassifierOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, +) +from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.models.roberta.modeling_roberta import ( + RobertaIntermediate, + RobertaLMHead, + RobertaOutput, + RobertaSelfOutput, +) +from transformers.utils import logging + +from .configuration_layoutlmv3 import LayoutLMv3Config +from timm.models.layers import to_2tuple + + +logger = logging.get_logger(__name__) + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + # The following variables are used in detection mycheckpointer.py + 
self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.num_patches_w = self.patch_shape[0] + self.num_patches_h = self.patch_shape[1] + + def forward(self, x, position_embedding=None): + x = self.proj(x) + + if position_embedding is not None: + # interpolate the position embedding to the corresponding size + position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2) + Hp, Wp = x.shape[2], x.shape[3] + position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic') + x = x + position_embedding + + x = x.flatten(2).transpose(1, 2) + return x + +class LayoutLMv3Embeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size) + self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size) + self.w_position_embeddings = 
nn.Embedding(config.max_2d_position_embeddings, config.shape_size) + + def _calc_spatial_position_embeddings(self, bbox): + try: + assert torch.all(0 <= bbox) and torch.all(bbox <= 1023) + left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0]) + upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1]) + right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2]) + lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3]) + except IndexError as e: + raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e + + h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023)) + w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023)) + + # below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add) + spatial_position_embeddings = torch.cat( + [ + left_position_embeddings, + upper_position_embeddings, + right_position_embeddings, + lower_position_embeddings, + h_position_embeddings, + w_position_embeddings, + ], + dim=-1, + ) + return spatial_position_embeddings + + def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. 
+ mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + def forward( + self, + input_ids=None, + bbox=None, + token_type_ids=None, + position_ids=None, + inputs_embeds=None, + past_key_values_length=0, + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids, self.padding_idx, past_key_values_length).to(input_ids.device) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox) + + embeddings = embeddings + spatial_position_embeddings + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+class LayoutLMv3PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = LayoutLMv3Config
+    base_model_prefix = "layoutlmv3"
+
+    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class LayoutLMv3SelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, 
self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.has_relative_attention_bias = config.has_relative_attention_bias
+        self.has_spatial_attention_bias = config.has_spatial_attention_bias
+
+    def transpose_for_scores(self, x):
+        # Reshape (batch, seq, all_head_size) -> (batch, num_heads, seq, head_size)
+        # so attention can be computed per head with batched matmuls.
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def cogview_attn(self, attention_scores, alpha=32):
+        '''
+        https://arxiv.org/pdf/2105.13290.pdf
+        Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax).
+        A replacement of the original nn.Softmax(dim=-1)(attention_scores)
+        Seems the new attention_probs will result in a slower speed and a little bias
+        Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison
+        The smaller atol (e.g., 1e-08), the better.
+        '''
+        # Subtracting the per-row max before softmax (scaled by alpha) keeps the
+        # exponentials in a numerically safe range; softmax output is unchanged
+        # up to floating-point error because softmax is shift-invariant.
+        scaled_attention_scores = attention_scores / alpha
+        max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
+        # max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1)
+        new_attention_scores = (scaled_attention_scores - max_value) * alpha
+        return nn.Softmax(dim=-1)(new_attention_scores)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        rel_pos=None,  # NOTE(review): presumably 1-D relative-position bias added to scores downstream — confirm shape (batch, heads, seq, seq)
+        rel_2d_pos=None,  # NOTE(review): presumably 2-D (spatial) relative-position bias — confirm shape matches rel_pos
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to. 
+ is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # The attention scores QT K/√d could be significantly larger than input elements, and result in overflow. + # Changing the computational order into QT(K/√d) alleviates the problem. 
(https://arxiv.org/pdf/2105.13290.pdf) + attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2)) + + if self.has_relative_attention_bias and self.has_spatial_attention_bias: + attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size) + elif self.has_relative_attention_bias: + attention_scores += rel_pos / math.sqrt(self.attention_head_size) + + # if self.has_relative_attention_bias: + # attention_scores += rel_pos + # if self.has_spatial_attention_bias: + # attention_scores += rel_2d_pos + + # attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + # attention_probs = nn.Softmax(dim=-1)(attention_scores) # comment the line below and use this line for speedup + attention_probs = self.cogview_attn(attention_scores) # to stablize training + # assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class LayoutLMv3Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = LayoutLMv3SelfAttention(config) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = 
self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class LayoutLMv3Layer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = LayoutLMv3Attention(config) + assert not config.is_decoder and not config.add_cross_attention, \ + "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder." + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + rel_pos=None, + rel_2d_pos=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + rel_pos=rel_pos, + rel_2d_pos=rel_2d_pos, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class LayoutLMv3Encoder(nn.Module): + def __init__(self, config, detection=False, out_features=None): + super().__init__() + self.config = config + self.detection = detection + self.layer = 
class LayoutLMv3Encoder(nn.Module):
    """Stack of :class:`LayoutLMv3Layer` blocks.

    Optionally adds learned 1D (sequence) and 2D (layout bbox) relative
    position biases shared across layers, and — in ``detection`` mode — taps
    intermediate hidden states through small FPN heads to produce multi-scale
    feature maps for a detectron2 backbone.
    """

    def __init__(self, config, detection=False, out_features=None):
        super().__init__()
        self.config = config
        self.detection = detection
        self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

        self.has_relative_attention_bias = config.has_relative_attention_bias
        self.has_spatial_attention_bias = config.has_spatial_attention_bias

        if self.has_relative_attention_bias:
            # 1D relative positions are bucketed, one-hot encoded, then
            # projected to one scalar bias per attention head.
            self.rel_pos_bins = config.rel_pos_bins
            self.max_rel_pos = config.max_rel_pos
            self.rel_pos_onehot_size = config.rel_pos_bins
            self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False)

        if self.has_spatial_attention_bias:
            # Separate bias tables for x and y relative layout offsets.
            self.max_rel_2d_pos = config.max_rel_2d_pos
            self.rel_2d_pos_bins = config.rel_2d_pos_bins
            self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins
            self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
            self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)

        if self.detection:
            # Detection always checkpoints to fit high-res inputs in memory.
            self.gradient_checkpointing = True
            embed_dim = self.config.hidden_size
            self.out_features = out_features
            # out_features are named like "layerN"; strip the "layer" prefix.
            self.out_indices = [int(name[5:]) for name in out_features]
            # fpn1..fpn4 rescale the ViT feature map to strides 4/8/16/32.
            self.fpn1 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
                # nn.SyncBatchNorm(embed_dim),
                nn.BatchNorm2d(embed_dim),
                nn.GELU(),
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn2 = nn.Sequential(
                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
            )

            self.fpn3 = nn.Identity()

            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
            self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]

    def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """Map signed relative positions to bucket indices (T5-style).

        Half the buckets cover exact small offsets; the other half cover
        logarithmically larger offsets up to ``max_distance``.
        """
        ret = 0
        if bidirectional:
            # Use the top half of the bucket range for positive offsets.
            num_buckets //= 2
            ret += (relative_position > 0).long() * num_buckets
            n = torch.abs(relative_position)
        else:
            n = torch.max(-relative_position, torch.zeros_like(relative_position))
        # now n is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = n < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).to(torch.long)
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))

        ret += torch.where(is_small, n, val_if_large)
        return ret

    def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span):
        """Per-head 1D relative position bias, shape (bsz, heads, seq, seq)."""
        # 196 image patches + 1 CLS token appended at the end of the sequence.
        VISUAL_NUM = 196 + 1

        rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)

        if valid_span is not None:
            # for the text part, if two words are not in the same line,
            # set their distance to the max value (position_ids.shape[-1])
            rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1]
            rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1]

            # image-text, minimum distance
            rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0
            rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0

        rel_pos = self.relative_position_bucket(
            rel_pos_mat,
            num_buckets=self.rel_pos_bins,
            max_distance=self.max_rel_pos,
        )
        rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states)
        rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2)
        rel_pos = rel_pos.contiguous()
        return rel_pos

    def _cal_2d_pos_emb(self, hidden_states, bbox):
        """Per-head 2D relative position bias from bbox left-x and bottom-y."""
        position_coord_x = bbox[:, :, 0]
        position_coord_y = bbox[:, :, 3]
        rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
        rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
        rel_pos_x = self.relative_position_bucket(
            rel_pos_x_2d_mat,
            num_buckets=self.rel_2d_pos_bins,
            max_distance=self.max_rel_2d_pos,
        )
        rel_pos_y = self.relative_position_bucket(
            rel_pos_y_2d_mat,
            num_buckets=self.rel_2d_pos_bins,
            max_distance=self.max_rel_2d_pos,
        )
        rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
        rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
        rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2)
        rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2)
        rel_pos_x = rel_pos_x.contiguous()
        rel_pos_y = rel_pos_y.contiguous()
        rel_2d_pos = rel_pos_x + rel_pos_y
        return rel_2d_pos

    def forward(
        self,
        hidden_states,
        bbox=None,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        position_ids=None,
        Hp=None,
        Wp=None,
        valid_span=None,
    ):
        """Run all layers.

        In detection mode returns a dict of multi-scale feature maps
        (``Hp``/``Wp`` give the patch-grid height/width); otherwise returns
        the standard encoder output (tuple or model-output object).
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        next_decoder_cache = () if use_cache else None

        # Biases are computed once here and shared by every layer.
        rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None
        rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None

        if self.detection:
            feat_out = {}
            j = 0

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:

                if use_cache:
                    logger.warning(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                    )
                    use_cache = False

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs)
                        # return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos)
                        # The above line will cause error:
                        # RuntimeError: Trying to backward through the graph a second time
                        # (or directly access saved tensors after they have already been freed).
                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    rel_pos,
                    rel_2d_pos
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                    rel_pos=rel_pos,
                    rel_2d_pos=rel_2d_pos,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

            if self.detection and i in self.out_indices:
                # Take the trailing Hp*Wp visual tokens, fold them back into a
                # (B, C, Hp, Wp) map, and rescale with the matching FPN head.
                xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)
                feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
                j += 1

        if self.detection:
            return feat_out

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
    """LayoutLMv3 backbone combining text embeddings (with layout bboxes) and
    ViT-style image patch embeddings into one sequence for the encoder.

    Supports three input configurations: text-only, text+image, and
    image-only (``detection`` mode for the detectron2 backbone).
    """

    _keys_to_ignore_on_load_missing = [r"position_ids"]

    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
    def __init__(self, config, detection=False, out_features=None, image_only=False):
        super().__init__(config)
        self.config = config
        assert not config.is_decoder and not config.add_cross_attention, \
            "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
        self.detection = detection
        if not self.detection:
            self.image_only = False
        else:
            # image_only is only meaningful when visual embeddings exist.
            assert config.visual_embed
            self.image_only = image_only

        if not self.image_only:
            self.embeddings = LayoutLMv3Embeddings(config)
        self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features)

        if config.visual_embed:
            embed_dim = self.config.hidden_size
            # use the default pre-training parameters for fine-tuning (e.g., input_size)
            # when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward
            self.patch_embed = PatchEmbed(embed_dim=embed_dim)

            patch_size = 16
            size = int(self.config.input_size / patch_size)
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim))
            self.pos_drop = nn.Dropout(p=0.)

            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
            self.dropout = nn.Dropout(config.hidden_dropout_prob)

            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
                self._init_visual_bbox(img_size=(size, size))

            from functools import partial
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
            self.norm = norm_layer(embed_dim)

        self.init_weights()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def _init_visual_bbox(self, img_size=(14, 14), max_len=1000):
        """Precompute one layout bbox per image patch (plus a CLS box) on a
        ``max_len``-normalized coordinate grid; cached on ``self.visual_bbox``.
        """
        visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len),
                                  img_size[1], rounding_mode='trunc')
        visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len),
                                  img_size[0], rounding_mode='trunc')
        visual_bbox = torch.stack(
            [
                visual_bbox_x[:-1].repeat(img_size[0], 1),
                visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1),
                visual_bbox_x[1:].repeat(img_size[0], 1),
                visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1),
            ],
            dim=-1,
        ).view(-1, 4)

        # CLS gets a near-full-page box; +1/-1 keep it off the exact borders.
        cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
        self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)

    def _calc_visual_bbox(self, device, dtype, bsz):  # , img_size=(14, 14), max_len=1000):
        # Broadcast the cached patch bboxes across the batch.
        visual_bbox = self.visual_bbox.repeat(bsz, 1, 1)
        visual_bbox = visual_bbox.to(device).type(dtype)
        return visual_bbox

    def forward_image(self, x):
        """Embed an image into a (B, 1 + num_patches, hidden) token sequence.

        In detection mode positional embeddings are interpolated inside
        ``patch_embed`` (variable input size); otherwise they are added here.
        """
        if self.detection:
            x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
        else:
            x = self.patch_embed(x)
        batch_size, seq_len, _ = x.size()

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
        if self.pos_embed is not None and self.detection:
            cls_tokens = cls_tokens + self.pos_embed[:, :1, :]

        x = torch.cat((cls_tokens, x), dim=1)
        if self.pos_embed is not None and not self.detection:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        x = self.norm(x)
        return x

    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
    def forward(
        self,
        input_ids=None,
        bbox=None,
        attention_mask=None,
        token_type_ids=None,
        valid_span=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        images=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encoder-only model: caching is never used, regardless of the arg.
        use_cache = False

        # if input_ids is not None and inputs_embeds is not None:
        #     raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is not None:
            input_shape = input_ids.size()
            batch_size, seq_length = input_shape
            device = input_ids.device
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
            batch_size, seq_length = input_shape
            device = inputs_embeds.device
        elif images is not None:
            batch_size = len(images)
            device = images.device
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds or images")

        if not self.image_only:
            # past_key_values_length
            past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

            if attention_mask is None:
                attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
            if token_type_ids is None:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        if not self.image_only:
            if bbox is None:
                bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)

            embedding_output = self.embeddings(
                input_ids=input_ids,
                bbox=bbox,
                position_ids=position_ids,
                token_type_ids=token_type_ids,
                inputs_embeds=inputs_embeds,
                past_key_values_length=past_key_values_length,
            )

        final_bbox = final_position_ids = None
        Hp = Wp = None
        if images is not None:
            patch_size = 16
            Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size)
            visual_emb = self.forward_image(images)
            if self.detection:
                visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
                if self.image_only:
                    attention_mask = visual_attention_mask
                else:
                    attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
            elif self.image_only:
                attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)

            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
                if self.config.has_spatial_attention_bias:
                    visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size)
                    if self.image_only:
                        final_bbox = visual_bbox
                    else:
                        final_bbox = torch.cat([bbox, visual_bbox], dim=1)

                # Visual tokens get fresh 0..N-1 position ids, appended after
                # the text position ids when text is present.
                visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat(
                    batch_size, 1)
                if self.image_only:
                    final_position_ids = visual_position_ids
                else:
                    position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
                    position_ids = position_ids.expand_as(input_ids)
                    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)

            if self.image_only:
                embedding_output = visual_emb
            else:
                embedding_output = torch.cat([embedding_output, visual_emb], dim=1)
                embedding_output = self.LayerNorm(embedding_output)
                embedding_output = self.dropout(embedding_output)
        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
            if self.config.has_spatial_attention_bias:
                final_bbox = bbox
            if self.config.has_relative_attention_bias:
                position_ids = self.embeddings.position_ids[:, :input_shape[1]]
                position_ids = position_ids.expand_as(input_ids)
                final_position_ids = position_ids

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device)

        encoder_outputs = self.encoder(
            embedding_output,
            bbox=final_bbox,
            position_ids=final_position_ids,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            Hp=Hp,
            Wp=Wp,
            valid_span=valid_span,
        )

        if self.detection:
            # Detection mode returns the FPN feature dict directly.
            return encoder_outputs

        sequence_output = encoder_outputs[0]
        # No pooler in this model; kept as None for output-shape compatibility.
        pooled_output = None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
class LayoutLMv3ClassificationHead(nn.Module):
    """Sentence-level classification head (mirrors RobertaClassificationHead).

    Pipeline: dropout -> dense -> tanh -> dropout -> output projection.
    With ``pool_feature=True`` the dense layer expects a concatenation of
    three hidden vectors (3 * hidden_size).
    """

    def __init__(self, config, pool_feature=False):
        super().__init__()
        self.pool_feature = pool_feature
        # Triple-width input when pooled features are concatenated upstream.
        in_features = config.hidden_size * 3 if pool_feature else config.hidden_size
        self.dense = nn.Linear(in_features, config.hidden_size)
        # classifier_dropout overrides the generic hidden dropout when set.
        drop_p = config.hidden_dropout_prob if config.classifier_dropout is None else config.classifier_dropout
        self.dropout = nn.Dropout(drop_p)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, x):
        # x = features[:, 0, :]  # callers pass the <s> (CLS) vector already
        hidden = self.dense(self.dropout(x))
        hidden = self.dropout(torch.tanh(hidden))
        return self.out_proj(hidden)
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + images=images, + valid_span=valid_span, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.layoutlmv3 = LayoutLMv3Model(config) + # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False) + + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + valid_span=None, + head_mask=None, + inputs_embeds=None, + 
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
    """LayoutLMv3 with an extractive-QA head predicting answer-span start and
    end logits over the token sequence.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.layoutlmv3 = LayoutLMv3Model(config)
        # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        valid_span=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        bbox=None,
        images=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.layoutlmv3(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            bbox=bbox,
            images=images,
            valid_span=valid_span,
        )

        sequence_output = outputs[0]

        # Head emits 2 values per token: (start_logit, end_logit).
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.layoutlmv3( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + bbox=bbox, + images=images, + valid_span=valid_span, + ) + + sequence_output = outputs[0][:, 0, :] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py b/magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py new file mode 100644 index 0000000000000000000000000000000000000000..f340d3c6aca04b6567614e6aa221f7c542239305 --- /dev/null 
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for LayoutLMv3, refer to RoBERTa."""

from transformers.models.roberta import RobertaTokenizer
from transformers.utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


class LayoutLMv3Tokenizer(RobertaTokenizer):
    """Slow LayoutLMv3 tokenizer.

    Identical to RoBERTa's byte-level BPE tokenizer; only the vocab file
    names and model input names are declared here.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization classes for LayoutLMv3, refer to RoBERTa."""


from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
from transformers.utils import logging

from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}


class LayoutLMv3TokenizerFast(RobertaTokenizerFast):
    """Fast (Rust-backed) LayoutLMv3 tokenizer.

    Identical to RoBERTa's fast byte-level BPE tokenizer; declares the vocab
    file names, model input names, and its paired slow tokenizer class.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = LayoutLMv3Tokenizer
default_setup, launch, DefaultPredictor + + +def add_vit_config(cfg): + """ + Add config for VIT. + """ + _C = cfg + + _C.MODEL.VIT = CN() + + # CoaT model name. + _C.MODEL.VIT.NAME = "" + + # Output features from CoaT backbone. + _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"] + + _C.MODEL.VIT.IMG_SIZE = [224, 224] + + _C.MODEL.VIT.POS_TYPE = "shared_rel" + + _C.MODEL.VIT.DROP_PATH = 0. + + _C.MODEL.VIT.MODEL_KWARGS = "{}" + + _C.SOLVER.OPTIMIZER = "ADAMW" + + _C.SOLVER.BACKBONE_MULTIPLIER = 1.0 + + _C.AUG = CN() + + _C.AUG.DETR = False + + _C.MODEL.IMAGE_ONLY = True + _C.PUBLAYNET_DATA_DIR_TRAIN = "" + _C.PUBLAYNET_DATA_DIR_TEST = "" + _C.FOOTNOTE_DATA_DIR_TRAIN = "" + _C.FOOTNOTE_DATA_DIR_VAL = "" + _C.SCIHUB_DATA_DIR_TRAIN = "" + _C.SCIHUB_DATA_DIR_TEST = "" + _C.JIAOCAI_DATA_DIR_TRAIN = "" + _C.JIAOCAI_DATA_DIR_TEST = "" + _C.ICDAR_DATA_DIR_TRAIN = "" + _C.ICDAR_DATA_DIR_TEST = "" + _C.M6DOC_DATA_DIR_TEST = "" + _C.DOCSTRUCTBENCH_DATA_DIR_TEST = "" + _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = "" + _C.CACHE_DIR = "" + _C.MODEL.CONFIG_PATH = "" + + # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS + # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS + _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1 + + +def setup(args, device): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + + # add_coat_config(cfg) + add_vit_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model + cfg.merge_from_list(args.opts) + + # 使用统一的device配置 + cfg.MODEL.DEVICE = device + + cfg.freeze() + default_setup(cfg, args) + + #@todo 可以删掉这块? 
+ # register_coco_instances( + # "scihub_train", + # {}, + # cfg.SCIHUB_DATA_DIR_TRAIN + ".json", + # cfg.SCIHUB_DATA_DIR_TRAIN + # ) + + return cfg + + +class DotDict(dict): + def __init__(self, *args, **kwargs): + super(DotDict, self).__init__(*args, **kwargs) + + def __getattr__(self, key): + if key not in self.keys(): + return None + value = self[key] + if isinstance(value, dict): + value = DotDict(value) + return value + + def __setattr__(self, key, value): + self[key] = value + + +class Layoutlmv3_Predictor(object): + def __init__(self, weights, config_file, device): + layout_args = { + "config_file": config_file, + "resume": False, + "eval_only": False, + "num_gpus": 1, + "num_machines": 1, + "machine_rank": 0, + "dist_url": "tcp://127.0.0.1:57823", + "opts": ["MODEL.WEIGHTS", weights], + } + layout_args = DotDict(layout_args) + + cfg = setup(layout_args, device) + self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", + "table_footnote", "isolate_formula", "formula_caption"] + MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping + self.predictor = DefaultPredictor(cfg) + + def __call__(self, image, ignore_catids=[]): + # page_layout_result = { + # "layout_dets": [] + # } + layout_dets = [] + outputs = self.predictor(image) + boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist() + labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist() + scores = outputs["instances"].to("cpu")._fields["scores"].tolist() + for bbox_idx in range(len(boxes)): + if labels[bbox_idx] in ignore_catids: + continue + layout_dets.append({ + "category_id": labels[bbox_idx], + "poly": [ + boxes[bbox_idx][0], boxes[bbox_idx][1], + boxes[bbox_idx][2], boxes[bbox_idx][1], + boxes[bbox_idx][2], boxes[bbox_idx][3], + boxes[bbox_idx][0], boxes[bbox_idx][3], + ], + "score": scores[bbox_idx] + }) + return layout_dets diff --git a/magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py 
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn

from detectron2.config import configurable
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage

from detectron2.modeling.backbone import Backbone, build_backbone
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY

from detectron2.modeling.meta_arch import GeneralizedRCNN

from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
from contextlib import contextmanager
from itertools import count


@META_ARCH_REGISTRY.register()
class VLGeneralizedRCNN(GeneralizedRCNN):
    """Generalized R-CNN whose backbone consumes a dict batch (see get_batch).

    Any model containing the following three components:
      1. Per-image feature extraction (aka backbone)
      2. Region proposal generation
      3. Per-region feature extraction and prediction
    """

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """Training-time forward; delegates to :meth:`inference` in eval mode.

        Args:
            batched_inputs: batched outputs of :class:`DatasetMapper`, one
                dict per image containing:

                * image: Tensor in (C, H, W) format.
                * instances (optional): groundtruth :class:`Instances`.
                * proposals (optional): precomputed :class:`Instances` proposals.
                * "height", "width" (int): output resolution used at inference
                  (see :meth:`postprocess`).

        Returns:
            In training: dict[str, Tensor] of proposal + detector losses.
            In eval: list[dict], one per image, each with an "instances" key
            holding "pred_boxes", "pred_classes", "scores", etc.
        """
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)
        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        # Unlike plain GeneralizedRCNN, the backbone takes a dict batch
        # rather than the raw image tensor.
        batch = self.get_batch(batched_inputs, images)
        features = self.backbone(batch)

        if self.proposal_generator is not None:
            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
        else:
            assert "proposals" in batched_inputs[0]
            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
            proposal_losses = {}

        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0:
                self.visualize_training(batched_inputs, proposals)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """Run inference on the given inputs.

        Args:
            batched_inputs (list[dict]): same as in :meth:`forward`.
            detected_instances (None or list[Instances]): if not None, one
                ``Instances`` per image holding known "pred_boxes" and
                "pred_classes"; box detection is then skipped and only the
                other per-ROI outputs are predicted.
            do_postprocess (bool): whether to apply post-processing.

        Returns:
            When do_postprocess=True, same as in :meth:`forward`;
            otherwise, a list[Instances] containing raw network outputs.
        """
        assert not self.training

        images = self.preprocess_image(batched_inputs)
        batch = self.get_batch(batched_inputs, images)
        features = self.backbone(batch)

        if detected_instances is None:
            if self.proposal_generator is not None:
                proposals, _ = self.proposal_generator(images, features, None)
            else:
                assert "proposals" in batched_inputs[0]
                proposals = [x["proposals"].to(self.device) for x in batched_inputs]

            results, _ = self.roi_heads(images, features, proposals, None)
        else:
            detected_instances = [x.to(self.device) for x in detected_instances]
            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        return results

    def get_batch(self, examples, images):
        """Build the dict consumed by the backbone.

        Only image-only batches are supported here: when no example carries a
        "bbox" field, the backbone gets just the batched image tensor.

        FIX: the original fall-through was ``return input``, which returned
        the *builtin* ``input`` function (a latent bug on a path that was
        never meant to be taken) — fail loudly instead.

        Args:
            examples: raw batched inputs (list of dicts).
            images: the preprocessed ``ImageList``.

        Returns:
            dict: {"images": batched image tensor}.

        Raises:
            NotImplementedError: if the batch carries "bbox" annotations.
        """
        if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only
            return {"images": images.tensor}
        raise NotImplementedError(
            "VLGeneralizedRCNN.get_batch only supports image-only inputs "
            "(batched examples must not carry a 'bbox' field)."
        )

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """Execute inference over ``batched_inputs`` in mini-batches of 2,
        instead of the full length of the list.

        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`.
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        # NOTE: renamed the loop variable from ``input`` to avoid shadowing
        # the builtin.
        for idx, single_input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(single_input)
            instances.append(instance)
            # Flush every 2 items, and once more at the end of the list.
            if len(inputs) == 2 or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=True,
                    )
                )
                inputs, instances = [], []
        return outputs
class GenericMask:
    """Lazy converter between mask representations.

    Attributes:
        polygons (list[ndarray]): polygons for this mask, each ndarray in
            [x, y, x, y, ...] format.
        mask (ndarray): a binary mask of shape (height, width).
    """

    def __init__(self, mask_or_polygons, height, width):
        self._mask = self._polygons = self._has_holes = None
        self.height = height
        self.width = width

        m = mask_or_polygons
        if isinstance(m, dict):
            # COCO RLE encoding
            assert "counts" in m and "size" in m
            if isinstance(m["counts"], list):  # uncompressed RLEs
                h, w = m["size"]
                assert h == height and w == width
                m = mask_util.frPyObjects(m, h, w)
            self._mask = mask_util.decode(m)[:, :]
            return

        if isinstance(m, list):  # list[ndarray] of polygons
            self._polygons = [np.asarray(x).reshape(-1) for x in m]
            return

        if isinstance(m, np.ndarray):  # assumed to be a binary mask
            assert m.shape[1] != 2, m.shape
            assert m.shape == (
                height,
                width,
            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
            self._mask = m.astype("uint8")
            return

        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))

    @property
    def mask(self):
        # Computed lazily from polygons on first access.
        if self._mask is None:
            self._mask = self.polygons_to_mask(self._polygons)
        return self._mask

    @property
    def polygons(self):
        # Computed lazily from the binary mask on first access.
        if self._polygons is None:
            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
        return self._polygons

    @property
    def has_holes(self):
        if self._has_holes is None:
            if self._mask is not None:
                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
            else:
                # If the original format is polygon, it does not have holes.
                self._has_holes = False
        return self._has_holes

    def mask_to_polygons(self, mask):
        # cv2.RETR_CCOMP retrieves all contours in a 2-level hierarchy:
        # external boundaries in hierarchy-1, holes in hierarchy-2.
        # cv2.CHAIN_APPROX_NONE keeps all polygon vertices.
        mask = np.ascontiguousarray(mask)  # some versions of cv2 do not support non-contiguous arr
        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
        hierarchy = res[-1]
        if hierarchy is None:  # empty mask
            return [], False
        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
        res = res[-2]
        res = [x.flatten() for x in res]
        # OpenCV coordinates are integers in [0, W-1 or H-1]. Add 0.5 to move
        # them into real-value coordinate space. (A better solution would be
        # to first +0.5 and then dilate the returned polygon by 0.5.)
        res = [x + 0.5 for x in res if len(x) >= 6]
        return res, has_holes

    def polygons_to_mask(self, polygons):
        rle = mask_util.frPyObjects(polygons, self.height, self.width)
        rle = mask_util.merge(rle)
        return mask_util.decode(rle)[:, :]

    def area(self):
        return self.mask.sum()

    def bbox(self):
        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
        p = mask_util.merge(p)
        bbox = mask_util.toBbox(p)
        # Convert XYWH -> XYXY.
        bbox[2] += bbox[0]
        bbox[3] += bbox[1]
        return bbox


class _PanopticPrediction:
    """Unify different panoptic annotation/prediction formats."""

    def __init__(self, panoptic_seg, segments_info, metadata=None):
        if segments_info is None:
            assert metadata is not None
            # If "segments_info" is None, we assume "panoptic_img" is a
            # H*W int32 image storing the panoptic_id in the format of
            # category_id * label_divisor + instance_id. We reserve -1 for
            # the VOID label.
            label_divisor = metadata.label_divisor
            segments_info = []
            for panoptic_label in np.unique(panoptic_seg.numpy()):
                if panoptic_label == -1:
                    # VOID region.
                    continue
                pred_class = panoptic_label // label_divisor
                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
                segments_info.append(
                    {
                        "id": int(panoptic_label),
                        "category_id": int(pred_class),
                        "isthing": bool(isthing),
                    }
                )
            del metadata

        self._seg = panoptic_seg

        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
        areas = areas.numpy()
        sorted_idxs = np.argsort(-areas)
        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
        self._seg_ids = self._seg_ids.tolist()
        for sid, area in zip(self._seg_ids, self._seg_areas):
            if sid in self._sinfo:
                self._sinfo[sid]["area"] = float(area)

    def non_empty_mask(self):
        """
        Returns:
            (H, W) array, a mask for all pixels that have a prediction
        """
        empty_ids = []
        for id in self._seg_ids:
            if id not in self._sinfo:
                empty_ids.append(id)
        if len(empty_ids) == 0:
            return np.zeros(self._seg.shape, dtype=np.uint8)
        assert (
            len(empty_ids) == 1
        ), ">1 ids corresponds to no labels. This is currently not supported"
        # FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
        # use the builtin bool, which is what the alias pointed to.
        return (self._seg != empty_ids[0]).numpy().astype(bool)

    def semantic_masks(self):
        """Yield (mask, segment_info) for each "stuff" (non-thing) segment."""
        for sid in self._seg_ids:
            sinfo = self._sinfo.get(sid)
            if sinfo is None or sinfo["isthing"]:
                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or
                # semantic predictions.
                continue
            # FIX: bool instead of the removed np.bool alias.
            yield (self._seg == sid).numpy().astype(bool), sinfo

    def instance_masks(self):
        """Yield (mask, segment_info) for each non-empty "thing" segment."""
        for sid in self._seg_ids:
            sinfo = self._sinfo.get(sid)
            if sinfo is None or not sinfo["isthing"]:
                continue
            # FIX: bool instead of the removed np.bool alias.
            mask = (self._seg == sid).numpy().astype(bool)
            if mask.sum() > 0:
                yield mask, sinfo


def _create_text_labels(classes, scores, class_names, is_crowd=None):
    """Build display labels like "plain text 97%" for each instance.

    Args:
        classes (list[int] or None): class indices.
        scores (list[float] or None): confidences in [0, 1].
        class_names (list[str] or None): index -> human-readable name.
        is_crowd (list[bool] or None): appends "|crowd" where True.

    Returns:
        list[str] or None
    """
    labels = None
    if classes is not None:
        if class_names is not None and len(class_names) > 0:
            labels = [class_names[i] for i in classes]
        else:
            labels = [str(i) for i in classes]

    if scores is not None:
        if labels is None:
            labels = ["{:.0f}%".format(s * 100) for s in scores]
        else:
            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
    if labels is not None and is_crowd is not None:
        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
    return labels


class VisImage:
    """Matplotlib-backed canvas that renders an RGB image plus overlays."""

    def __init__(self, img, scale=1.0):
        """
        Args:
            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
            scale (float): scale the input image
        """
        self.img = img
        self.scale = scale
        self.width, self.height = img.shape[1], img.shape[0]
        self._setup_figure(img)

    def _setup_figure(self, img):
        """
        Args:
            Same as in :meth:`__init__()`.

        Returns:
            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
        """
        fig = mplfigure.Figure(frameon=False)
        self.dpi = fig.get_dpi()
        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
        # (https://github.com/matplotlib/matplotlib/issues/15363)
        fig.set_size_inches(
            (self.width * self.scale + 1e-2) / self.dpi,
            (self.height * self.scale + 1e-2) / self.dpi,
        )
        self.canvas = FigureCanvasAgg(fig)
        # Axes span the full figure so image pixels map 1:1 to canvas pixels.
        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
        ax.axis("off")
        self.fig = fig
        self.ax = ax
        self.reset_image(img)

    def reset_image(self, img):
        """
        Args:
            img: same as in __init__
        """
        img = img.astype("uint8")
        # extent flips the y-axis so (0, 0) is the top-left corner.
        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")

    def save(self, filepath):
        """
        Args:
            filepath (str): absolute path, including the file name, where
                the visualized image will be saved.
        """
        self.fig.savefig(filepath)

    def get_image(self):
        """
        Returns:
            ndarray:
                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
                The shape is scaled w.r.t the input image using the given
                `scale` argument.
        """
        canvas = self.canvas
        s, (width, height) = canvas.print_to_buffer()

        buffer = np.frombuffer(s, dtype="uint8")

        img_rgba = buffer.reshape(height, width, 4)
        # Drop the alpha channel; callers expect plain RGB.
        rgb, alpha = np.split(img_rgba, [3], axis=2)
        return rgb.astype("uint8")
+ + Note that the exact visualization style for the high-level wrappers are subject to change. + Style such as color, opacity, label contents, visibility of labels, or even the visibility + of objects themselves (e.g. when the object is too small) may change according + to different heuristics, as long as the results still look visually reasonable. + + To obtain a consistent style, you can implement custom drawing functions with the + abovementioned primitive methods instead. If you need more customized visualization + styles, you can process the data yourself following their format documented in + tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not + intend to satisfy everyone's preference on drawing styles. + + This visualizer focuses on high rendering quality rather than performance. It is not + designed to be used for real-time applications. + """ + + # TODO implement a fast, rasterized version using OpenCV + + def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): + """ + Args: + img_rgb: a numpy array of shape (H, W, C), where H and W correspond to + the height and width of the image respectively. C is the number of + color channels. The image is required to be in RGB format since that + is a requirement of the Matplotlib library. The image is also expected + to be in the range [0, 255]. + metadata (Metadata): dataset metadata (e.g. class names and colors) + instance_mode (ColorMode): defines one of the pre-defined style for drawing + instances on an image. 
+ """ + self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) + if metadata is None: + metadata = MetadataCatalog.get("__nonexist__") + self.metadata = metadata + self.output = VisImage(self.img, scale=scale) + self.cpu_device = torch.device("cpu") + + # too small texts are useless, therefore clamp to 9 + self._default_font_size = max( + np.sqrt(self.output.height * self.output.width) // 90, 10 // scale + ) + self._instance_mode = instance_mode + self.keypoint_threshold = _KEYPOINT_THRESHOLD + + def draw_instance_predictions(self, predictions): + """ + Draw instance-level prediction results on an image. + + Args: + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to draw: + "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). + + Returns: + output (VisImage): image object with visualizations. + """ + boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None + scores = predictions.scores if predictions.has("scores") else None + classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None + labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) + keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None + + if predictions.has("pred_masks"): + masks = np.asarray(predictions.pred_masks) + masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] + else: + masks = None + + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes + ] + alpha = 0.8 + else: + colors = None + alpha = 0.5 + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image( + self._create_grayscale_image( + (predictions.pred_masks.any(dim=0) > 0).numpy() + if predictions.has("pred_masks") + else None + ) + ) + alpha = 0.3 + + 
self.overlay_instances( + masks=masks, + boxes=boxes, + labels=labels, + keypoints=keypoints, + assigned_colors=colors, + alpha=alpha, + ) + return self.output + + def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8): + """ + Draw semantic segmentation predictions/labels. + + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + + Returns: + output (VisImage): image object with visualizations. + """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + + binary_mask = (sem_seg == label).astype(np.uint8) + text = self.metadata.stuff_classes[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7): + """ + Draw panoptic prediction annotations or results. + + Args: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each + segment. + segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. + If it is a ``list[dict]``, each dict contains keys "id", "category_id". + If None, category id of each pixel is computed by + ``pixel // metadata.label_divisor``. + area_threshold (int): stuff segments with less than `area_threshold` are not drawn. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) + + # draw mask for all semantic segments first i.e. "stuff" + for mask, sinfo in pred.semantic_masks(): + category_idx = sinfo["category_id"] + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] + except AttributeError: + mask_color = None + + text = self.metadata.stuff_classes[category_idx] + self.draw_binary_mask( + mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + + # draw mask for all instances second + all_instances = list(pred.instance_masks()) + if len(all_instances) == 0: + return self.output + masks, sinfo = list(zip(*all_instances)) + category_ids = [x["category_id"] for x in sinfo] + + try: + scores = [x["score"] for x in sinfo] + except KeyError: + scores = None + labels = _create_text_labels( + category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo] + ) + + try: + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids + ] + except AttributeError: + colors = None + self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) + + return self.output + + draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility + + def draw_dataset_dict(self, dic): + """ + Draw annotations/segmentaions in Detectron2 Dataset format. + + Args: + dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + annos = dic.get("annotations", None) + if annos: + if "segmentation" in annos[0]: + masks = [x["segmentation"] for x in annos] + else: + masks = None + if "keypoints" in annos[0]: + keypts = [x["keypoints"] for x in annos] + keypts = np.array(keypts).reshape(len(annos), -1, 3) + else: + keypts = None + + boxes = [ + BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) + if len(x["bbox"]) == 4 + else x["bbox"] + for x in annos + ] + + colors = None + category_ids = [x["category_id"] for x in annos] + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) + for c in category_ids + ] + names = self.metadata.get("thing_classes", None) + labels = _create_text_labels( + category_ids, + scores=None, + class_names=names, + is_crowd=[x.get("iscrowd", 0) for x in annos], + ) + self.overlay_instances( + labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors + ) + + sem_seg = dic.get("sem_seg", None) + if sem_seg is None and "sem_seg_file_name" in dic: + with PathManager.open(dic["sem_seg_file_name"], "rb") as f: + sem_seg = Image.open(f) + sem_seg = np.asarray(sem_seg, dtype="uint8") + if sem_seg is not None: + self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) + + pan_seg = dic.get("pan_seg", None) + if pan_seg is None and "pan_seg_file_name" in dic: + with PathManager.open(dic["pan_seg_file_name"], "rb") as f: + pan_seg = Image.open(f) + pan_seg = np.asarray(pan_seg) + from panopticapi.utils import rgb2id + + pan_seg = rgb2id(pan_seg) + if pan_seg is not None: + segments_info = dic["segments_info"] + pan_seg = torch.tensor(pan_seg) + self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) + return self.output + + def overlay_instances( + self, + *, + boxes=None, + labels=None, + masks=None, + keypoints=None, + assigned_colors=None, + alpha=0.5, + ): + """ + Args: + boxes (Boxes, RotatedBoxes or 
ndarray): either a :class:`Boxes`, + or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, + or a :class:`RotatedBoxes`, + or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image, + labels (list[str]): the text to be displayed for each instance. + masks (masks-like object): Supported types are: + + * :class:`detectron2.structures.PolygonMasks`, + :class:`detectron2.structures.BitMasks`. + * list[list[ndarray]]: contains the segmentation masks for all objects in one image. + The first level of the list corresponds to individual instances. The second + level to all the polygon that compose the instance, and the third level + to the polygon coordinates. The third level should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + * list[ndarray]: each ndarray is a binary mask of shape (H, W). + * list[dict]: each dict is a COCO-style RLE. + keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), + where the N is the number of instances and K is the number of keypoints. + The last dimension corresponds to (x, y, visibility or score). + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + num_instances = 0 + if boxes is not None: + boxes = self._convert_boxes(boxes) + num_instances = len(boxes) + if masks is not None: + masks = self._convert_masks(masks) + if num_instances: + assert len(masks) == num_instances + else: + num_instances = len(masks) + if keypoints is not None: + if num_instances: + assert len(keypoints) == num_instances + else: + num_instances = len(keypoints) + keypoints = self._convert_keypoints(keypoints) + if labels is not None: + assert len(labels) == num_instances + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + if boxes is not None and boxes.shape[1] == 5: + return self.overlay_rotated_instances( + boxes=boxes, labels=labels, assigned_colors=assigned_colors + ) + + # Display in largest to smallest order to reduce occlusion. + areas = None + if boxes is not None: + areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) + elif masks is not None: + areas = np.asarray([x.area() for x in masks]) + + if areas is not None: + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] if boxes is not None else None + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None + assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] + keypoints = keypoints[sorted_idxs] if keypoints is not None else None + + for i in range(num_instances): + color = assigned_colors[i] + if boxes is not None: + self.draw_box(boxes[i], edge_color=color) + + if masks is not None: + for segment in masks[i].polygons: + self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) + + if labels is not None: + # first get a box + if boxes is not None: + x0, y0, x1, y1 = boxes[i] + text_pos = (x0, y0) # if drawing boxes, put text on the box corner. 
+ horiz_align = "left" + elif masks is not None: + # skip small mask without polygon + if len(masks[i].polygons) == 0: + continue + + x0, y0, x1, y1 = masks[i].bbox() + + # draw text in the center (defined by median) when box is not drawn + # median is less sensitive to outliers. + text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] + horiz_align = "center" + else: + continue # drawing the box confidence for keypoints isn't very useful. + # for small objects, draw text at the side to avoid occlusion + instance_area = (y1 - y0) * (x1 - x0) + if ( + instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale + or y1 - y0 < 40 * self.output.scale + ): + if y1 >= self.output.height - 5: + text_pos = (x1, y0) + else: + text_pos = (x0, y1) + + height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) + * 0.5 + * self._default_font_size + ) + self.draw_text( + labels[i], + text_pos, + color=lighter_color, + horizontal_alignment=horiz_align, + font_size=font_size, + ) + + # draw keypoints + if keypoints is not None: + for keypoints_per_instance in keypoints: + self.draw_and_connect_keypoints(keypoints_per_instance) + + return self.output + + def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): + """ + Args: + boxes (ndarray): an Nx5 numpy array of + (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image. + labels (list[str]): the text to be displayed for each instance. + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + num_instances = len(boxes) + + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + + # Display in largest to smallest order to reduce occlusion. + if boxes is not None: + areas = boxes[:, 2] * boxes[:, 3] + + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + colors = [assigned_colors[idx] for idx in sorted_idxs] + + for i in range(num_instances): + self.draw_rotated_box_with_label( + boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None + ) + + return self.output + + def draw_and_connect_keypoints(self, keypoints): + """ + Draws keypoints of an instance and follows the rules for keypoint connections + to draw lines between appropriate keypoints. This follows color heuristics for + line color. + + Args: + keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints + and the last dimension corresponds to (x, y, probability). + + Returns: + output (VisImage): image object with visualizations. + """ + visible = {} + keypoint_names = self.metadata.get("keypoint_names") + for idx, keypoint in enumerate(keypoints): + # draw keypoint + x, y, prob = keypoint + if prob > self.keypoint_threshold: + self.draw_circle((x, y), color=_RED) + if keypoint_names: + keypoint_name = keypoint_names[idx] + visible[keypoint_name] = (x, y) + + if self.metadata.get("keypoint_connection_rules"): + for kp0, kp1, color in self.metadata.keypoint_connection_rules: + if kp0 in visible and kp1 in visible: + x0, y0 = visible[kp0] + x1, y1 = visible[kp1] + color = tuple(x / 255.0 for x in color) + self.draw_line([x0, x1], [y0, y1], color=color) + + # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip + # Note that this strategy is specific to person keypoints. 
+ # For other keypoints, it should just do nothing + try: + ls_x, ls_y = visible["left_shoulder"] + rs_x, rs_y = visible["right_shoulder"] + mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 + except KeyError: + pass + else: + # draw line from nose to mid-shoulder + nose_x, nose_y = visible.get("nose", (None, None)) + if nose_x is not None: + self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) + + try: + # draw line from mid-shoulder to mid-hip + lh_x, lh_y = visible["left_hip"] + rh_x, rh_y = visible["right_hip"] + except KeyError: + pass + else: + mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 + self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) + return self.output + + """ + Primitive drawing functions: + """ + + def draw_text( + self, + text, + position, + *, + font_size=None, + color="g", + horizontal_alignment="center", + rotation=0, + ): + """ + Args: + text (str): class label + position (tuple): a tuple of the x and y coordinates to place text on image. + font_size (int, optional): font of the text. If not provided, a font size + proportional to the image width is calculated and used. + color: color of the text. Refer to `matplotlib.colors` for full list + of formats that are accepted. + horizontal_alignment (str): see `matplotlib.text.Text` + rotation: rotation angle in degrees CCW + + Returns: + output (VisImage): image object with text drawn. 
+ """ + if not font_size: + font_size = self._default_font_size + + # since the text background is dark, we don't want the text to be dark + color = np.maximum(list(mplc.to_rgb(color)), 0.2) + color[np.argmax(color)] = max(0.8, np.max(color)) + + x, y = position + self.output.ax.text( + x, + y, + text, + size=font_size * self.output.scale, + family="sans-serif", + bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, + verticalalignment="top", + horizontalalignment=horizontal_alignment, + color=color, + zorder=10, + rotation=rotation, + ) + return self.output + + def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): + """ + Args: + box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 + are the coordinates of the image's top left corner. x1 and y1 are the + coordinates of the image's bottom right corner. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + + Returns: + output (VisImage): image object with box drawn. + """ + x0, y0, x1, y1 = box_coord + width = x1 - x0 + height = y1 - y0 + + linewidth = max(self._default_font_size / 4, 1) + + self.output.ax.add_patch( + mpl.patches.Rectangle( + (x0, y0), + width, + height, + fill=False, + edgecolor=edge_color, + linewidth=linewidth * self.output.scale, + alpha=alpha, + linestyle=line_style, + ) + ) + return self.output + + def draw_rotated_box_with_label( + self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None + ): + """ + Draw a rotated box with label on its top-left corner. + + Args: + rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), + where cnt_x and cnt_y are the center coordinates of the box. + w and h are the width and height of the box. 
angle represents how + many degrees the box is rotated CCW with regard to the 0-degree box. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + label (string): label for rotated box. It will not be rendered when set to None. + + Returns: + output (VisImage): image object with box drawn. + """ + cnt_x, cnt_y, w, h, angle = rotated_box + area = w * h + # use thinner lines when the box is small + linewidth = self._default_font_size / ( + 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 + ) + + theta = angle * math.pi / 180.0 + c = math.cos(theta) + s = math.sin(theta) + rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] + # x: left->right ; y: top->down + rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] + for k in range(4): + j = (k + 1) % 4 + self.draw_line( + [rotated_rect[k][0], rotated_rect[j][0]], + [rotated_rect[k][1], rotated_rect[j][1]], + color=edge_color, + linestyle="--" if k == 1 else line_style, + linewidth=linewidth, + ) + + if label is not None: + text_pos = rotated_rect[1] # topleft corner + + height_ratio = h / np.sqrt(self.output.height * self.output.width) + label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size + ) + self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) + + return self.output + + def draw_circle(self, circle_coord, color, radius=3): + """ + Args: + circle_coord (list(int) or tuple(int)): contains the x and y coordinates + of the center of the circle. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. 
+ radius (int): radius of the circle. + + Returns: + output (VisImage): image object with box drawn. + """ + x, y = circle_coord + self.output.ax.add_patch( + mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) + ) + return self.output + + def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): + """ + Args: + x_data (list[int]): a list containing x values of all the points being drawn. + Length of list should match the length of y_data. + y_data (list[int]): a list containing y values of all the points being drawn. + Length of list should match the length of x_data. + color: color of the line. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + linestyle: style of the line. Refer to `matplotlib.lines.Line2D` + for a full list of formats that are accepted. + linewidth (float or None): width of the line. When it's None, + a default value will be computed and used. + + Returns: + output (VisImage): image object with line drawn. + """ + if linewidth is None: + linewidth = self._default_font_size / 3 + linewidth = max(linewidth, 1) + self.output.ax.add_line( + mpl.lines.Line2D( + x_data, + y_data, + linewidth=linewidth * self.output.scale, + color=color, + linestyle=linestyle, + ) + ) + return self.output + + def draw_binary_mask( + self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0 + ): + """ + Args: + binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and + W is the image width. Each value in the array is either a 0 or 1 value of uint8 + type. + color: color of the mask. Refer to `matplotlib.colors` for a full list of + formats that are accepted. If None, will pick a random color. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. + text (str): if None, will be drawn in the object's center of mass. + alpha (float): blending efficient. 
Smaller values lead to more transparent masks. + area_threshold (float): a connected component small than this will not be shown. + + Returns: + output (VisImage): image object with mask drawn. + """ + if color is None: + color = random_color(rgb=True, maximum=1) + color = mplc.to_rgb(color) + + has_valid_segment = False + binary_mask = binary_mask.astype("uint8") # opencv needs uint8 + mask = GenericMask(binary_mask, self.output.height, self.output.width) + shape2d = (binary_mask.shape[0], binary_mask.shape[1]) + + if not mask.has_holes: + # draw polygons for regular masks + for segment in mask.polygons: + area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) + if area < (area_threshold or 0): + continue + has_valid_segment = True + segment = segment.reshape(-1, 2) + self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) + else: + # TODO: Use Path/PathPatch to draw vector graphics: + # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon + rgba = np.zeros(shape2d + (4,), dtype="float32") + rgba[:, :, :3] = color + rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha + has_valid_segment = True + self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) + + if text is not None and has_valid_segment: + # TODO sometimes drawn on wrong objects. the heuristics here can improve. + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) + largest_component_id = np.argmax(stats[1:, -1]) + 1 + + # draw text on the largest component, as well as other very large components. 
+ for cid in range(1, _num_cc): + if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: + # median is more stable than centroid + # center = centroids[largest_component_id] + center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] + self.draw_text(text, center, color=lighter_color) + return self.output + + def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): + """ + Args: + segment: numpy array of shape Nx2, containing all the points in the polygon. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. If not provided, a darker shade + of the polygon color will be used instead. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + + Returns: + output (VisImage): image object with polygon drawn. + """ + if edge_color is None: + # make edge color darker than the polygon color + if alpha > 0.8: + edge_color = self._change_color_brightness(color, brightness_factor=-0.7) + else: + edge_color = color + edge_color = mplc.to_rgb(edge_color) + (1,) + + polygon = mpl.patches.Polygon( + segment, + fill=True, + facecolor=mplc.to_rgb(color) + (alpha,), + edgecolor=edge_color, + linewidth=max(self._default_font_size // 15 * self.output.scale, 1), + ) + self.output.ax.add_patch(polygon) + return self.output + + """ + Internal methods: + """ + + def _jitter(self, color): + """ + Randomly modifies given color to produce a slightly different color than the color given. + + Args: + color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color + picked. The values in the list are in the [0.0, 1.0] range. + + Returns: + jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the + color after being jittered. The values in the list are in the [0.0, 1.0] range. 
+ """ + color = mplc.to_rgb(color) + vec = np.random.rand(3) + # better to do it in another color space + vec = vec / np.linalg.norm(vec) * 0.5 + res = np.clip(vec + color, 0, 1) + return tuple(res) + + def _create_grayscale_image(self, mask=None): + """ + Create a grayscale version of the original image. + The colors in masked area, if given, will be kept. + """ + img_bw = self.img.astype("f4").mean(axis=2) + img_bw = np.stack([img_bw] * 3, axis=2) + if mask is not None: + img_bw[mask] = self.img[mask] + return img_bw + + def _change_color_brightness(self, color, brightness_factor): + """ + Depending on the brightness_factor, gives a lighter or darker color i.e. a color with + less or more saturation than the original color. + + Args: + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of + 0 will correspond to no change, a factor in [-1.0, 0) range will result in + a darker color and a factor in (0, 1.0] range will result in a lighter color. + + Returns: + modified_color (tuple[double]): a tuple containing the RGB values of the + modified color. Each value in the tuple is in the [0.0, 1.0] range. + """ + assert brightness_factor >= -1.0 and brightness_factor <= 1.0 + color = mplc.to_rgb(color) + polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) + modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) + modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness + modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness + modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) + return modified_color + + def _convert_boxes(self, boxes): + """ + Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. 
+ """ + if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): + return boxes.tensor.detach().numpy() + else: + return np.asarray(boxes) + + def _convert_masks(self, masks_or_polygons): + """ + Convert different format of masks or polygons to a tuple of masks and polygons. + + Returns: + list[GenericMask]: + """ + + m = masks_or_polygons + if isinstance(m, PolygonMasks): + m = m.polygons + if isinstance(m, BitMasks): + m = m.tensor.numpy() + if isinstance(m, torch.Tensor): + m = m.numpy() + ret = [] + for x in m: + if isinstance(x, GenericMask): + ret.append(x) + else: + ret.append(GenericMask(x, self.output.height, self.output.width)) + return ret + + def _convert_keypoints(self, keypoints): + if isinstance(keypoints, Keypoints): + keypoints = keypoints.tensor + keypoints = np.asarray(keypoints) + return keypoints + + def get_output(self): + """ + Returns: + output (VisImage): the image output containing the visualizations added + to the image. + """ + return self.output diff --git a/magic_pdf/model/pek_sub_modules/post_process.py b/magic_pdf/model/pek_sub_modules/post_process.py new file mode 100644 index 0000000000000000000000000000000000000000..aa050b612d29849b341b2850c9c4a1bcacf904dd --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/post_process.py @@ -0,0 +1,36 @@ +import re + +def layout_rm_equation(layout_res): + rm_idxs = [] + for idx, ele in enumerate(layout_res['layout_dets']): + if ele['category_id'] == 10: + rm_idxs.append(idx) + + for idx in rm_idxs[::-1]: + del layout_res['layout_dets'][idx] + return layout_res + + +def get_croped_image(image_pil, bbox): + x_min, y_min, x_max, y_max = bbox + croped_img = image_pil.crop((x_min, y_min, x_max, y_max)) + return croped_img + + +def latex_rm_whitespace(s: str): + """Remove unnecessary whitespace from LaTeX code. + """ + text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? 
{.*?})' + letter = '[a-zA-Z]' + noletter = '[\W_^\d]' + names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)] + s = re.sub(text_reg, lambda match: str(names.pop(0)), s) + news = s + while True: + s = news + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s) + news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news) + news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) + if news == s: + break + return s \ No newline at end of file diff --git a/magic_pdf/model/pek_sub_modules/self_modify.py b/magic_pdf/model/pek_sub_modules/self_modify.py new file mode 100644 index 0000000000000000000000000000000000000000..47cb591ca60dbb0644ee59072f0cb7a49459770c --- /dev/null +++ b/magic_pdf/model/pek_sub_modules/self_modify.py @@ -0,0 +1,260 @@ +import time +import copy +import base64 +import cv2 +import numpy as np +from io import BytesIO +from PIL import Image + +from paddleocr import PaddleOCR +from paddleocr.ppocr.utils.logging import get_logger +from paddleocr.ppocr.utils.utility import check_and_read, alpha_to_color, binarize_img +from paddleocr.tools.infer.utility import draw_ocr_box_txt, get_rotate_crop_image, get_minarea_rect_crop +logger = get_logger() + +def img_decode(content: bytes): + np_arr = np.frombuffer(content, dtype=np.uint8) + return cv2.imdecode(np_arr, cv2.IMREAD_UNCHANGED) + +def check_img(img): + if isinstance(img, bytes): + img = img_decode(img) + if isinstance(img, str): + image_file = img + img, flag_gif, flag_pdf = check_and_read(image_file) + if not flag_gif and not flag_pdf: + with open(image_file, 'rb') as f: + img_str = f.read() + img = img_decode(img_str) + if img is None: + try: + buf = BytesIO() + image = BytesIO(img_str) + im = Image.open(image) + rgb = im.convert('RGB') + rgb.save(buf, 'jpeg') + buf.seek(0) + image_bytes = buf.read() + data_base64 = str(base64.b64encode(image_bytes), + encoding="utf-8") + image_decode = base64.b64decode(data_base64) + img_array = 
np.frombuffer(image_decode, np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + except: + logger.error("error in loading image:{}".format(image_file)) + return None + if img is None: + logger.error("error in loading image:{}".format(image_file)) + return None + if isinstance(img, np.ndarray) and len(img.shape) == 2: + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + + return img + +def sorted_boxes(dt_boxes): + """ + Sort text boxes in order from top to bottom, left to right + args: + dt_boxes(array):detected text boxes with shape [4, 2] + return: + sorted boxes(array) with shape [4, 2] + """ + num_boxes = dt_boxes.shape[0] + sorted_boxes = sorted(dt_boxes, key=lambda x: (x[0][1], x[0][0])) + _boxes = list(sorted_boxes) + + for i in range(num_boxes - 1): + for j in range(i, -1, -1): + if abs(_boxes[j + 1][0][1] - _boxes[j][0][1]) < 10 and \ + (_boxes[j + 1][0][0] < _boxes[j][0][0]): + tmp = _boxes[j] + _boxes[j] = _boxes[j + 1] + _boxes[j + 1] = tmp + else: + break + return _boxes + + +def formula_in_text(mf_bbox, text_bbox): + x1, y1, x2, y2 = mf_bbox + x3, y3 = text_bbox[0] + x4, y4 = text_bbox[2] + left_box, right_box = None, None + same_line = abs((y1+y2)/2 - (y3+y4)/2) / abs(y4-y3) < 0.2 + if not same_line: + return False, left_box, right_box + else: + drop_origin = False + left_x = x1 - 1 + right_x = x2 + 1 + if x3 < x1 and x2 < x4: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') + if x3 < x1 and x1 <= x4 <= x2: + drop_origin = True + left_box = np.array([text_bbox[0], [left_x, text_bbox[1][1]], [left_x, text_bbox[2][1]], text_bbox[3]]).astype('float32') + if x1 <= x3 <= x2 and x2 < x4: + drop_origin = True + right_box = np.array([[right_x, text_bbox[0][1]], text_bbox[1], text_bbox[2], [right_x, text_bbox[3][1]]]).astype('float32') 
+ if x1 <= x3 < x4 <= x2: + drop_origin = True + return drop_origin, left_box, right_box + + +def update_det_boxes(dt_boxes, mfdetrec_res): + new_dt_boxes = dt_boxes + for mf_box in mfdetrec_res: + flag, left_box, right_box = False, None, None + for idx, text_box in enumerate(new_dt_boxes): + ret, left_box, right_box = formula_in_text(mf_box['bbox'], text_box) + if ret: + new_dt_boxes.pop(idx) + if left_box is not None: + new_dt_boxes.append(left_box) + if right_box is not None: + new_dt_boxes.append(right_box) + break + + return new_dt_boxes + +class ModifiedPaddleOCR(PaddleOCR): + def ocr(self, img, det=True, rec=True, cls=True, bin=False, inv=False, mfd_res=None, alpha_color=(255, 255, 255)): + """ + OCR with PaddleOCR + args: + img: img for OCR, support ndarray, img_path and list or ndarray + det: use text detection or not. If False, only rec will be exec. Default is True + rec: use text recognition or not. If False, only det will be exec. Default is True + cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False. + bin: binarize image to black and white. Default is False. + inv: invert image colors. Default is False. + alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white. 
+ """ + assert isinstance(img, (np.ndarray, list, str, bytes)) + if isinstance(img, list) and det == True: + logger.error('When input a list of images, det must be false') + exit(0) + if cls == True and self.use_angle_cls == False: + pass + # logger.warning( + # 'Since the angle classifier is not initialized, it will not be used during the forward process' + # ) + + img = check_img(img) + # for infer pdf file + if isinstance(img, list): + if self.page_num > len(img) or self.page_num == 0: + self.page_num = len(img) + imgs = img[:self.page_num] + else: + imgs = [img] + + def preprocess_image(_image): + _image = alpha_to_color(_image, alpha_color) + if inv: + _image = cv2.bitwise_not(_image) + if bin: + _image = binarize_img(_image) + return _image + + if det and rec: + ocr_res = [] + for idx, img in enumerate(imgs): + img = preprocess_image(img) + dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res) + if not dt_boxes and not rec_res: + ocr_res.append(None) + continue + tmp_res = [[box.tolist(), res] + for box, res in zip(dt_boxes, rec_res)] + ocr_res.append(tmp_res) + return ocr_res + elif det and not rec: + ocr_res = [] + for idx, img in enumerate(imgs): + img = preprocess_image(img) + dt_boxes, elapse = self.text_detector(img) + if not dt_boxes: + ocr_res.append(None) + continue + tmp_res = [box.tolist() for box in dt_boxes] + ocr_res.append(tmp_res) + return ocr_res + else: + ocr_res = [] + cls_res = [] + for idx, img in enumerate(imgs): + if not isinstance(img, list): + img = preprocess_image(img) + img = [img] + if self.use_angle_cls and cls: + img, cls_res_tmp, elapse = self.text_classifier(img) + if not rec: + cls_res.append(cls_res_tmp) + rec_res, elapse = self.text_recognizer(img) + ocr_res.append(rec_res) + if not rec: + return cls_res + return ocr_res + + def __call__(self, img, cls=True, mfd_res=None): + time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0} + + if img is None: + logger.debug("no valid image provided") + return None, None, 
time_dict + + start = time.time() + ori_im = img.copy() + dt_boxes, elapse = self.text_detector(img) + time_dict['det'] = elapse + + if dt_boxes is None: + logger.debug("no dt_boxes found, elapsed : {}".format(elapse)) + end = time.time() + time_dict['all'] = end - start + return None, None, time_dict + else: + logger.debug("dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), elapse)) + img_crop_list = [] + + dt_boxes = sorted_boxes(dt_boxes) + if mfd_res: + bef = time.time() + dt_boxes = update_det_boxes(dt_boxes, mfd_res) + aft = time.time() + logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format( + len(dt_boxes), aft-bef)) + + for bno in range(len(dt_boxes)): + tmp_box = copy.deepcopy(dt_boxes[bno]) + if self.args.det_box_type == "quad": + img_crop = get_rotate_crop_image(ori_im, tmp_box) + else: + img_crop = get_minarea_rect_crop(ori_im, tmp_box) + img_crop_list.append(img_crop) + if self.use_angle_cls and cls: + img_crop_list, angle_list, elapse = self.text_classifier( + img_crop_list) + time_dict['cls'] = elapse + logger.debug("cls num : {}, elapsed : {}".format( + len(img_crop_list), elapse)) + + rec_res, elapse = self.text_recognizer(img_crop_list) + time_dict['rec'] = elapse + logger.debug("rec_res num : {}, elapsed : {}".format( + len(rec_res), elapse)) + if self.args.save_crop_res: + self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, + rec_res) + filter_boxes, filter_rec_res = [], [] + for box, rec_result in zip(dt_boxes, rec_res): + text, score = rec_result + if score >= self.drop_score: + filter_boxes.append(box) + filter_rec_res.append(rec_result) + end = time.time() + time_dict['all'] = end - start + return filter_boxes, filter_rec_res, time_dict \ No newline at end of file diff --git a/magic_pdf/model/pp_structure_v2.py b/magic_pdf/model/pp_structure_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..49b593086f8d915e3d8e40c740fcfbd857ae6330 --- /dev/null +++ 
b/magic_pdf/model/pp_structure_v2.py @@ -0,0 +1,87 @@ +import random + +from loguru import logger + +try: + from paddleocr import PPStructure +except ImportError: + logger.error('paddleocr not installed, please install by "pip install magic-pdf[cpu]" or "pip install magic-pdf[gpu]"') + exit(1) + + +def region_to_bbox(region): + x0 = region[0][0] + y0 = region[0][1] + x1 = region[2][0] + y1 = region[2][1] + return [x0, y0, x1, y1] + + +class CustomPaddleModel: + def __init__(self, ocr: bool = False, show_log: bool = False): + self.model = PPStructure(table=False, ocr=ocr, show_log=show_log) + + def __call__(self, img): + try: + import cv2 + except ImportError: + logger.error("opencv-python not installed, please install by pip.") + exit(1) + # 将RGB图片转换为BGR格式适配paddle + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + result = self.model(img) + spans = [] + for line in result: + line.pop("img") + """ + 为paddle输出适配type no. + title: 0 # 标题 + text: 1 # 文本 + header: 2 # abandon + footer: 2 # abandon + reference: 1 # 文本 or abandon + equation: 8 # 行间公式 block + equation: 14 # 行间公式 text + figure: 3 # 图片 + figure_caption: 4 # 图片描述 + table: 5 # 表格 + table_caption: 6 # 表格描述 + """ + if line["type"] == "title": + line["category_id"] = 0 + elif line["type"] in ["text", "reference"]: + line["category_id"] = 1 + elif line["type"] == "figure": + line["category_id"] = 3 + elif line["type"] == "figure_caption": + line["category_id"] = 4 + elif line["type"] == "table": + line["category_id"] = 5 + elif line["type"] == "table_caption": + line["category_id"] = 6 + elif line["type"] == "equation": + line["category_id"] = 8 + elif line["type"] in ["header", "footer"]: + line["category_id"] = 2 + else: + logger.warning(f"unknown type: {line['type']}") + + # 兼容不输出score的paddleocr版本 + if line.get("score") is None: + line["score"] = 0.5 + random.random() * 0.5 + + res = line.pop("res", None) + if res is not None and len(res) > 0: + for span in res: + new_span = { + "category_id": 15, + "bbox": 
region_to_bbox(span["text_region"]), + "score": span["confidence"], + "text": span["text"], + } + spans.append(new_span) + + if len(spans) > 0: + result.extend(spans) + + return result diff --git a/magic_pdf/para/__init__.py b/magic_pdf/para/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/para/block_continuation_processor.py b/magic_pdf/para/block_continuation_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..b4aa59d7728f733ed74abb627b52fe8cca5650ac --- /dev/null +++ b/magic_pdf/para/block_continuation_processor.py @@ -0,0 +1,562 @@ +import os +import unicodedata + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class BlockContinuationProcessor: + """ + This class is used to process the blocks to detect block continuations. + """ + + def __init__(self) -> None: + pass + + def __is_similar_font_type(self, font_type1, font_type2, prefix_length_ratio=0.3): + """ + This function checks if the two font types are similar. + Definition of similar font types: the two font types have a common prefix, + and the length of the common prefix is at least a certain ratio of the length of the shorter font type. + + Parameters + ---------- + font_type1 : str + font type 1 + font_type2 : str + font type 2 + prefix_length_ratio : float + minimum ratio of the common prefix length to the length of the shorter font type + + Returns + ------- + bool + True if the two font types are similar, False otherwise. 
+ """ + + if isinstance(font_type1, list): + font_type1 = font_type1[0] if font_type1 else "" + if isinstance(font_type2, list): + font_type2 = font_type2[0] if font_type2 else "" + + if font_type1 == font_type2: + return True + + # Find the length of the common prefix + common_prefix_length = len(os.path.commonprefix([font_type1, font_type2])) + + # Calculate the minimum prefix length based on the ratio + min_prefix_length = int(min(len(font_type1), len(font_type2)) * prefix_length_ratio) + + return common_prefix_length >= min_prefix_length + + def __is_same_block_font(self, block1, block2): + """ + This function compares the font of block1 and block2 + + Parameters + ---------- + block1 : dict + block1 + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 have the same font, else False + """ + block_1_font_type = safe_get(block1, "block_font_type", "") + block_1_font_size = safe_get(block1, "block_font_size", 0) + block_1_avg_char_width = safe_get(block1, "avg_char_width", 0) + + block_2_font_type = safe_get(block2, "block_font_type", "") + block_2_font_size = safe_get(block2, "block_font_size", 0) + block_2_avg_char_width = safe_get(block2, "avg_char_width", 0) + + if isinstance(block_1_font_size, list): + block_1_font_size = block_1_font_size[0] if block_1_font_size else 0 + if isinstance(block_2_font_size, list): + block_2_font_size = block_2_font_size[0] if block_2_font_size else 0 + + block_1_text = safe_get(block1, "text", "") + block_2_text = safe_get(block2, "text", "") + + if block_1_avg_char_width == 0 or block_2_avg_char_width == 0: + return False + + if not block_1_text or not block_2_text: + return False + else: + text_len_ratio = len(block_2_text) / len(block_1_text) + if text_len_ratio < 0.2: + avg_char_width_condition = ( + abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.5 + ) + else: + avg_char_width_condition = ( + abs(block_1_avg_char_width - 
block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.2 + ) + + block_font_size_condtion = abs(block_1_font_size - block_2_font_size) < 1 + + return ( + self.__is_similar_font_type(block_1_font_type, block_2_font_type) + and avg_char_width_condition + and block_font_size_condtion + ) + + def _is_alphabet_char(self, char): + if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"): + return True + else: + return False + + def _is_chinese_char(self, char): + if char >= "\u4e00" and char <= "\u9fa5": + return True + else: + return False + + def _is_other_letter_char(self, char): + try: + cat = unicodedata.category(char) + if cat == "Lu" or cat == "Ll": + return not self._is_alphabet_char(char) and not self._is_chinese_char(char) + except TypeError: + print("The input to the function must be a single character.") + return False + + def _is_year(self, s: str): + try: + number = int(s) + return 1900 <= number <= 2099 + except ValueError: + return False + + def __is_para_font_consistent(self, para_1, para_2): + """ + This function compares the font of para1 and para2 + + Parameters + ---------- + para1 : dict + para1 + para2 : dict + para2 + + Returns + ------- + is_same : bool + True if para1 and para2 have the same font, else False + """ + if para_1 is None or para_2 is None: + return False + + para_1_font_type = safe_get(para_1, "para_font_type", "") + para_1_font_size = safe_get(para_1, "para_font_size", 0) + para_1_font_color = safe_get(para_1, "para_font_color", "") + + para_2_font_type = safe_get(para_2, "para_font_type", "") + para_2_font_size = safe_get(para_2, "para_font_size", 0) + para_2_font_color = safe_get(para_2, "para_font_color", "") + + if isinstance(para_1_font_type, list): # get the most common font type + para_1_font_type = max(set(para_1_font_type), key=para_1_font_type.count) + if isinstance(para_2_font_type, list): + para_2_font_type = max(set(para_2_font_type), key=para_2_font_type.count) 
def _is_para_puncs_consistent(self, para_1, para_2):
    """
    Decide from punctuation and layout whether para_2 (latter) continues
    para_1 (former), i.e. both fragments come from one original paragraph.

    Parameters
    ----------
    para_1 : dict
        The earlier paragraph fragment.
    para_2 : dict
        The candidate continuation.

    Returns
    -------
    bool
        True when the ending of para_1 and the beginning of para_2 look
        like a paragraph that was split across blocks or pages.
    """
    para_1_text = safe_get(para_1, "para_text", "").strip()
    para_2_text = safe_get(para_2, "para_text", "").strip()

    para_1_bboxes = safe_get(para_1, "para_bbox", [])
    para_1_font_sizes = safe_get(para_1, "para_font_size", 0)
    para_2_bboxes = safe_get(para_2, "para_bbox", [])
    para_2_font_sizes = safe_get(para_2, "para_font_size", 0)

    if is_nested_list(para_1_bboxes):
        # Multi-line paragraph: judge by its LAST line.
        x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
        # Bug fix: mirror the para_2 handling below.  When the bboxes are
        # nested, the font sizes are a list as well, and the threshold
        # arithmetic further down would raise TypeError on a list.
        para_1_font_sizes = para_1_font_sizes[-1]  # type: ignore
    else:
        x0_1, y0_1, x1_1, y1_1 = para_1_bboxes

    if is_nested_list(para_2_bboxes):
        # Multi-line paragraph: judge by its FIRST line.
        x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
        para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
    else:
        x0_2, y0_2, x1_2, y1_2 = para_2_bboxes

    # Layout cues, scaled by the mean font size of the two fragments.
    align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
    are_two_paras_right_aligned = abs(x1_1 - x1_2) < align_threshold
    is_para1_left_indent_than_para2 = x0_1 - x0_2 > align_threshold
    is_para2_left_indent_than_para1 = x0_2 - x0_1 > align_threshold

    if not para_1_text or not para_2_text:
        return False

    end_puncs = [".", "?", "!", "。", "?", "!", "…"]
    hyphens = ["-", "—"]

    last_char = para_1_text[-1]
    first_char = para_2_text[0]

    if last_char in hyphens:
        # Hyphenated line break: the continuation should carry on the word.
        return bool(
            first_char in hyphens
            or (self._is_alphabet_char(first_char) and first_char.islower())
            or self._is_chinese_char(first_char)
            or self._is_other_letter_char(first_char)
        )

    if last_char in end_puncs:
        # Sentence finished: the continuation must look like a new sentence
        # of the same paragraph and must not be left-indented further.
        return bool(
            (
                first_char == " "
                or (self._is_alphabet_char(first_char) and first_char.isupper())
                or self._is_chinese_char(first_char)
                or self._is_other_letter_char(first_char)
            )
            and not is_para2_left_indent_than_para1
        )

    # para_1 does not end with closing punctuation: likely an unfinished
    # sentence spilling into para_2.  Note: the original had a trailing
    # "para_1 ends with space" branch; it was unreachable because the text
    # is stripped above and this branch already matches every remaining
    # ending, so it has been removed.  The redundant lowercase-letter test
    # was folded into the plain letter test.
    return bool(
        first_char == " "
        or self._is_alphabet_char(first_char)
        or self._is_year(para_2_text[0:4])
        or are_two_paras_right_aligned
        or is_para1_left_indent_than_para2
        or self._is_chinese_char(first_char)
        or self._is_other_letter_char(first_char)
    )


def _is_block_consistent(self, block1, block2):
    """
    Return True when block1 and block2 appear to come from one original
    block, judged purely by block-level font similarity.
    """
    return self.__is_same_block_font(block1, block2)
def _is_para_continued(self, para1, para2):
    """
    Determine whether para2 continues para1 (same original paragraph).

    Both the font-consistency check and the punctuation/layout check
    must agree.
    """
    font_ok = self.__is_para_font_consistent(para1, para2)
    puncs_ok = self._is_para_puncs_consistent(para1, para2)
    return font_ok and puncs_ok


def _are_boundaries_of_block_consistent(self, block1, block2):
    """
    Check font consistency across the boundary between two blocks: the
    first span of block1's LAST line against the first span of block2's
    FIRST line.

    Font type must be similar, sizes must differ by less than 1, and the
    style flags must match exactly.  Color is read but intentionally not
    compared (kept from the original logic).
    """
    tail_span = block1["lines"][-1]["spans"][0]
    head_span = block2["lines"][0]["spans"][0]

    tail_font = tail_span["font"].lower()
    tail_size = tail_span["size"]
    tail_color = tail_span["color"]  # read, not compared
    tail_flags = tail_span["flags"]

    head_font = head_span["font"].lower()
    head_size = head_span["size"]
    head_color = head_span["color"]  # read, not compared
    head_flags = head_span["flags"]

    return (
        self.__is_similar_font_type(tail_font, head_font)
        and abs(tail_size - head_size) < 1
        and tail_flags == head_flags
    )
def _get_last_paragraph(self, block):
    """Return the last paragraph dict of *block*, or None if it has none."""
    paras = block["paras"]
    if not paras:
        return None
    return paras[list(paras.keys())[-1]]


def _get_first_paragraph(self, block):
    """Return the first paragraph dict of *block*, or None if it has none."""
    paras = block["paras"]
    if not paras:
        return None
    return paras[list(paras.keys())[0]]


def should_merge_next_para(self, curr_para, next_para):
    """Return True when next_para should be merged into curr_para."""
    return self._is_para_continued(curr_para, next_para)


def batch_tag_paras(self, pdf_dict):
    """
    Tag every paragraph with its own location and, where a paragraph is
    judged to continue in the following block (possibly on a later page),
    with the location of that continuation.

    Adds to each paragraph dict:
      - "curr_para_location": [page_index, block_id, para_index]
      - "next_para_location": the same triple for the continuation, or None
      - "merge_next_para":    True when the continuation should be merged

    NOTE(review): the page index is the enumeration index over ALL keys of
    pdf_dict, and page keys are assumed to be exactly f"page_{index}" —
    confirm this still holds when non-page keys are present in the dict.
    """
    last_page_index = len(pdf_dict) - 1

    for page_index, (page_id, page_content) in enumerate(pdf_dict.items()):
        if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
            continue

        blocks = page_content["para_blocks"]
        lookahead_idx = page_index + 1
        lookahead_content = pdf_dict.get(f"page_{lookahead_idx}", {})

        for block_pos, block in enumerate(blocks):
            # Tag every paragraph of this block with its own location and
            # the continuation defaults.
            for para_id, para in block["paras"].items():
                para["curr_para_location"] = [
                    page_index,
                    block["block_id"],
                    int(para_id.split("_")[-1]),
                ]
                para["next_para_location"] = None  # default: no continuation
                para["merge_next_para"] = False

            # The candidate continuation of this block's LAST paragraph is
            # the FIRST paragraph of the next block: the following block on
            # this page, or the first block of the next non-empty page.
            last_para_key = list(block["paras"].keys())[-1]
            last_para = block["paras"][last_para_key]

            if block_pos < len(blocks) - 1:
                next_block = blocks[block_pos + 1]
                first_key = list(next_block["paras"].keys())[0]
                first_para = next_block["paras"][first_key]
                if self.should_merge_next_para(last_para, first_para):
                    last_para["next_para_location"] = [
                        page_index,
                        next_block["block_id"],
                        int(first_key.split("_")[-1]),
                    ]
                    last_para["merge_next_para"] = True
            else:
                # Skip empty pages until one with para_blocks is found.
                while not lookahead_content.get("para_blocks", []) and lookahead_idx <= last_page_index:
                    lookahead_idx += 1
                    lookahead_content = pdf_dict.get(f"page_{lookahead_idx}", {})

                if lookahead_content.get("para_blocks", []):
                    next_block = lookahead_content["para_blocks"][0]
                    first_key = list(next_block["paras"].keys())[0]
                    first_para = next_block["paras"][first_key]
                    if self.should_merge_next_para(last_para, first_para):
                        last_para["next_para_location"] = [
                            lookahead_idx,
                            next_block["block_id"],
                            int(first_key.split("_")[-1]),
                        ]
                        last_para["merge_next_para"] = True

    return pdf_dict


def find_block_by_id(self, para_blocks, block_id):
    """Return the block whose "block_id" equals *block_id*, or None."""
    return next((b for b in para_blocks if b.get("block_id") == block_id), None)


def batch_merge_paras(self, pdf_dict):
    """
    Follow the chains built by batch_tag_paras and concatenate each
    continuation paragraph into its first fragment.

    Merged-away fragments keep their dict entry but their "para_text" is
    set to "".  Title paragraphs are never merged, in either role.
    """
    for page_id, page_content in pdf_dict.items():
        if not (page_id.startswith("page_") and page_content.get("para_blocks", [])):
            continue

        for block in page_content["para_blocks"]:
            for para_id, para in list(block["paras"].items()):
                if para.get("is_para_title"):
                    continue  # titles are left untouched

                # Follow the continuation chain while it is flagged.
                while para.get("merge_next_para"):
                    location = para.get("next_para_location")
                    if not location:
                        break

                    next_page_idx, next_block_id, next_para_idx = location
                    target_page = pdf_dict.get(f"page_{next_page_idx}")
                    if not target_page:
                        break

                    target_block = self.find_block_by_id(
                        target_page.get("para_blocks", []), next_block_id
                    )
                    if not target_block:
                        break

                    next_para = target_block["paras"].get(f"para_{next_para_idx}")
                    if not next_para or next_para.get("is_para_title"):
                        break

                    # Concatenate with a single space, inherit the chain
                    # tail, and blank out the merged fragment.
                    para["para_text"] = para.get("para_text", "") + " " + next_para.get("para_text", "")
                    para["next_para_location"] = next_para.get("next_para_location")
                    next_para["para_text"] = ""
                    para["merge_next_para"] = next_para.get("merge_next_para", False)

    return pdf_dict
def _is_consistent_lines(self, curr_line, prev_line, next_line, consistent_direction):
    """
    Check whether curr_line's font (size and type of its first span)
    matches its neighbour(s).

    consistent_direction: 0 = compare with prev_line, 1 = with next_line,
    2 = with both.  Returns False when the requested neighbour is missing
    or the direction code is unknown.

    NOTE(review): font sizes are compared with exact float equality here,
    while the rest of the pipeline uses a tolerance — confirm intended.
    """
    def _same_font(other_line):
        return (
            curr_line["spans"][0]["size"] == other_line["spans"][0]["size"]
            and curr_line["spans"][0]["font"].lower() == other_line["spans"][0]["font"].lower()
        )

    if consistent_direction == 0:
        return bool(prev_line) and _same_font(prev_line)
    if consistent_direction == 1:
        return bool(next_line) and _same_font(next_line)
    if consistent_direction == 2:
        return bool(prev_line and next_line) and _same_font(prev_line) and _same_font(next_line)
    return False


def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height):
    """
    Heuristic: does this line look like a "regular" body line rather than
    a paragraph boundary?

    True when there is sufficient vertical spacing to a neighbour, or the
    line hugs neither page margin, or the previous line ends well short of
    the right margin (i.e. looks like a paragraph end).

    NOTE(review): the vertical gaps are computed as y1 - prev_bbox[3] and
    next_bbox[1] - y0, which mixes top/bottom edges — confirm the intended
    coordinate convention before relying on these spacings.
    """
    horizontal_thres = 0.5 * avg_char_width
    vertical_thres = 0.5 * avg_line_height

    x0, y0, x1, y1 = curr_line_bbox

    hugs_left = abs(x0 - X0) < horizontal_thres
    hugs_right = abs(x1 - X1) < horizontal_thres

    prev_ends_short = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width)

    gap_above_ok = bool(prev_line_bbox) and (y1 - prev_line_bbox[3] > vertical_thres)
    gap_below_ok = bool(next_line_bbox) and (next_line_bbox[1] - y0 > vertical_thres)

    return (
        (gap_above_ok or gap_below_ok)
        or (not hugs_left and not hugs_right)
        or prev_ends_short
    )


def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size):
    """
    Score the likelihood that curr_line starts a paragraph.

    Returns (is_start, confidence, decision_path): confidence starts at
    0.5 and is increased for each matched cue; decision_path lists the
    names of the matched cues.
    """
    confidence = 0.5
    path = []

    curr_bbox = curr_line["bbox"]
    prev_bbox = prev_line["bbox"] if prev_line else None
    next_bbox = next_line["bbox"] if next_line else None

    vertical_thres = 1.5 * avg_font_size  # spacing that suggests a break
    left_thres = 0.5 * avg_char_width     # closeness to the left margin
    right_thres = 2.5 * avg_char_width    # closeness to the right margin

    x0, y0, x1, y1 = curr_bbox

    indented = x0 > X0 + 1 * avg_char_width
    if indented:
        confidence += 0.2
        path.append("indent_condition_met")

    near_left = abs(x0 - X0) < left_thres
    if near_left:
        confidence += 0.1
        path.append("x0_near_X0")

    near_right = abs(x1 - X1) < right_thres
    if near_right:
        confidence += 0.1
        path.append("x1_near_X1")

    if prev_line is None:
        prev_is_end = True
        confidence += 0.2
        path.append("no_prev_line")
    else:
        # NOTE(review): next_line is passed as prev_line's neighbour here;
        # curr_line looks like the intended argument — confirm upstream.
        prev_is_end, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width)
        if prev_is_end:
            confidence += 0.1
            path.append("prev_line_is_end_of_para")

    gap_above = False
    if prev_bbox:
        gap_above = (y1 - prev_bbox[3]) > vertical_thres
        if gap_above:
            confidence += 0.2
            path.append("sufficient_spacing_above")

    gap_below = False
    if next_bbox:
        gap_below = (next_bbox[1] - y0) > vertical_thres
        if gap_below:
            confidence += 0.2
            path.append("sufficient_spacing_below")

    regular = self._is_regular_line(
        curr_bbox, prev_bbox, next_bbox, avg_char_width, X0, X1, avg_font_size
    )
    if regular:
        confidence += 0.1
        path.append("is_regular_line")

    is_start = (
        (gap_above or gap_below)
        or indented
        or (not indented and near_left and near_right and not regular)
        or prev_is_end
    )
    return (is_start, confidence, path)


def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width):
    """
    Score the likelihood that curr_line ends a paragraph.

    Returns (is_end, confidence, decision_path).  The final decision
    requires the line text to end with sentence punctuation AND a layout
    cue: a short line near the left margin, or left-aligned but not
    right-aligned with respect to the next line.
    """
    confidence = 0.5
    path = []

    curr_bbox = curr_line["bbox"]
    next_bbox = next_line["bbox"] if next_line else None

    x0, _, x1, y1 = curr_bbox
    next_x0, next_y0, _, _ = next_bbox if next_bbox else (0, 0, 0, 0)

    near_left = abs(x0 - X0) < 0.5 * avg_char_width
    if near_left:
        confidence += 0.1
        path.append("x0_near_X0")

    ends_short = x1 < X1 - 0.5 * avg_char_width
    if ends_short:
        confidence += 0.1
        path.append("x1_smaller_than_X1")

    next_starts_para = (
        next_bbox
        and (next_x0 > X0 + 0.5 * avg_char_width)
        and (not is_line_left_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width, direction=1))
    )
    if next_starts_para:
        confidence += 0.2
        path.append("next_line_is_start_of_para")

    left_aligned = is_line_left_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width)
    if left_aligned:
        confidence += 0.1
        path.append("line_is_left_aligned_from_neighbors")

    right_aligned = is_line_right_aligned_from_neighbors(curr_bbox, None, next_bbox, avg_char_width)
    if not right_aligned:
        confidence += 0.1
        path.append("line_is_not_right_aligned_from_neighbors")

    is_end = end_with_punctuation(curr_line["text"]) and (
        (near_left and ends_short)
        or (left_aligned and not right_aligned)
    )
    return (is_end, confidence, path)
def _cut_paras_per_block(self, block):
    """
    Split one preprocessed block into paragraphs.

    Uses the start/end heuristics to find paragraph line ranges, then
    builds a "paras" dict ("para_0", "para_1", ...) on the block; each
    entry carries the paragraph bbox, joined text, dominant font
    type/size/color and the block's title flags.

    Returns the same block dict, mutated in place.
    """

    def _build_para(lines, is_title, title_level):
        """Aggregate span statistics of *lines* into one paragraph dict."""
        sizes = [span["size"] for line in lines for span in line["spans"]]
        avg_size = sum(sizes) / len(sizes) if sizes else 0

        colors = [span["color"] for line in lines for span in line["spans"]]
        main_color = max(set(colors), key=colors.count) if colors else None

        # Dominant font type = the one covering the largest total bbox width.
        width_per_font = {}
        for line in lines:
            for span in line["spans"]:
                width = span["bbox"][2] - span["bbox"][0]
                width_per_font[span["font"]] = width_per_font.get(span["font"], 0) + width
        main_font = max(width_per_font, key=width_per_font.get) if width_per_font else None  # type: ignore

        return {
            "para_bbox": calculate_para_bbox(lines),
            "para_text": " ".join(line["text"] for line in lines),
            "para_font_type": main_font,
            "para_font_size": avg_size,
            "para_font_color": main_color,
            "is_para_title": is_title,
            "para_title_level": title_level,
        }

    block_bbox = block["bbox"]
    block_text = block["text"]
    block_lines = block["lines"]

    X0 = safe_get(block, "X0", 0)
    X1 = safe_get(block, "X1", 0)
    avg_char_width = safe_get(block, "avg_char_width", 0)
    avg_char_height = safe_get(block, "avg_char_height", 0)
    avg_font_size = safe_get(block, "avg_font_size", 0)

    is_block_title = safe_get(block, "is_block_title", False)
    title_level = safe_get(block, "block_title_level", 0)

    # --- locate paragraph line ranges -------------------------------------
    para_ranges = []
    in_para = False
    start_idx = None

    for idx, line in enumerate(block_lines):
        prev_line = block_lines[idx - 1] if idx > 0 else None
        next_line = block_lines[idx + 1] if idx < len(block_lines) - 1 else None

        starts, _start_conf, _ = self._is_possible_start_of_para(
            line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size
        )
        if not in_para and starts:
            in_para = True
            start_idx = idx

        ends, _end_conf, _ = self._is_possible_end_of_para(
            line, next_line, X0, X1, avg_char_width
        )
        if in_para and (ends or not next_line):
            para_ranges.append((start_idx, idx))
            start_idx = None
            in_para = False

    # Close a paragraph that was left open at the end of the block.
    if in_para and start_idx is not None:
        para_ranges.append((start_idx, len(block_lines) - 1))

    # --- materialise the paragraphs ---------------------------------------
    paras = {}
    consumed_up_to = 0
    for first, last in para_ranges:
        para = _build_para(block_lines[first : last + 1], is_block_title, title_level)
        paras[f"para_{len(paras)}"] = para
        consumed_up_to = last + 1

    # Any trailing lines not covered by a detected range become one more para.
    if consumed_up_to < len(block_lines):
        para = _build_para(block_lines[consumed_up_to:], is_block_title, title_level)
        paras[f"para_{len(paras)}"] = para

    block["paras"] = paras
    return block


def batch_process_blocks(self, pdf_dict):
    """
    Run paragraph cutting over every page's para_blocks and record the
    total paragraph count under pdf_dict["statistics"]["num_paras"].

    Returns the mutated pdf_dict.
    """
    num_paras = 0

    for page_id, page in pdf_dict.items():
        if not page_id.startswith("page_"):
            continue
        processed = []
        if "para_blocks" in page.keys():
            for raw_block in page["para_blocks"]:
                cut_block = self._cut_paras_per_block(raw_block)
                processed.append(cut_block)
                num_paras += len(cut_block["paras"])
        # Note: also (re)set to an empty list on pages that had no
        # para_blocks, matching the original behaviour.
        page["para_blocks"] = processed

    pdf_dict["statistics"]["num_paras"] = num_paras
    return pdf_dict
"on_red", attrs=["bold"], end="\n\n") + + +def print_green(text): + print() + cprint(text, "green", attrs=["bold"], end="\n\n") + + +def print_red(text): + print() + cprint(text, "red", attrs=["bold"], end="\n\n") + + +def print_yellow(text): + print() + cprint(text, "yellow", attrs=["bold"], end="\n\n") + + +def safe_get(dict_obj, key, default): + val = dict_obj.get(key) + if val is None: + return default + else: + return val + + +def is_bbox_overlap(bbox1, bbox2): + """ + This function checks if bbox1 and bbox2 overlap or not + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 and bbox2 overlap, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 > x1_2 or x0_2 > x1_1: + return False + if y0_1 > y1_2 or y0_2 > y1_1: + return False + + return True + + +def is_in_bbox(bbox1, bbox2): + """ + This function checks if bbox1 is in bbox2 + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 is in bbox2, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2: + return True + else: + return False + + +def calculate_para_bbox(lines): + """ + This function calculates the minimum bbox of the paragraph + + Parameters + ---------- + lines : list + lines + + Returns + ------- + para_bbox : list + bbox of the paragraph + """ + x0 = min(line["bbox"][0] for line in lines) + y0 = min(line["bbox"][1] for line in lines) + x1 = max(line["bbox"][2] for line in lines) + y1 = max(line["bbox"][3] for line in lines) + return [x0, y0, x1, y1] + + +def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is right aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + 
prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is right aligned from its neighbors, False otherwise. + """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + _, _, x1, _ = curr_line_bbox + _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x1 - prev_x1) < horizontal_thres + elif direction == 1: + return abs(x1 - next_x1) < horizontal_thres + elif direction == 2: + return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres + else: + return False + + +def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is left aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is left aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, _, _ = curr_line_bbox + prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x0 - prev_x0) < horizontal_thres + elif direction == 1: + return abs(x0 - next_x0) < horizontal_thres + elif direction == 2: + return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres + else: + return False + + +def end_with_punctuation(line_text): + """ + This function checks if the line ends with punctuation marks + """ + + english_end_puncs = [".", "?", "!"] + chinese_end_puncs = ["。", "?", "!"] + end_puncs = english_end_puncs + chinese_end_puncs + + last_non_space_char = None + for ch in line_text[::-1]: + if not ch.isspace(): + last_non_space_char = ch + break + + if last_non_space_char is None: + return False + + return last_non_space_char in end_puncs + + +def is_nested_list(lst): + if isinstance(lst, list): + return any(isinstance(sub, list) for sub in lst) + return False diff --git a/magic_pdf/para/denoise.py b/magic_pdf/para/denoise.py new file mode 100644 index 0000000000000000000000000000000000000000..2d49f3834e25b6b9d7c07203810ec9fe8a6618a5 --- /dev/null +++ b/magic_pdf/para/denoise.py @@ -0,0 +1,246 @@ +import math + +from collections import defaultdict +from magic_pdf.para.commons import * + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class HeaderFooterProcessor: + def __init__(self) -> None: + pass + + def get_most_common_bboxes(self, bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2): + """ + This function gets the most common bboxes from the bboxes + + Parameters + ---------- + bboxes : list + bboxes + page_height : float + height of the page + position : str, optional + "top" or "bottom", by default "top" + threshold : float, optional + 
threshold, by default 0.25 + num_bboxes : int, optional + number of bboxes to return, by default 3 + min_frequency : int, optional + minimum frequency of the bbox, by default 2 + + Returns + ------- + common_bboxes : list + common bboxes + """ + # Filter bbox by position + if position == "top": + filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold] + else: + filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)] + + # Find the most common bbox + bbox_count = defaultdict(int) + for bbox in filtered_bboxes: + bbox_count[tuple(bbox)] += 1 + + # Get the most frequently occurring bbox, but only consider it when the frequency exceeds min_frequency + common_bboxes = [ + bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency + ][:num_bboxes] + return common_bboxes + + def detect_footer_header(self, result_dict, similarity_threshold=0.5): + """ + This function detects the header and footer of the document. 
+ + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + result_dict : dict + result dictionary + """ + + def compare_bbox_with_list(bbox, bbox_list, tolerance=1): + return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list) + + def is_single_line_block(block): + # Determine based on the width and height of the block + block_width = block["X1"] - block["X0"] + block_height = block["bbox"][3] - block["bbox"][1] + + # If the height of the block is close to the average character height and the width is large, it is considered a single line + return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3 + + # Traverse all blocks in the document + single_preproc_blocks = 0 + total_blocks = 0 + single_preproc_blocks = 0 + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + total_blocks += 1 + if is_single_line_block(block): + single_preproc_blocks += 1 + + # If there are no blocks, skip the header and footer detection + if total_blocks == 0: + print("No blocks found. 
Skipping header/footer detection.") + return result_dict + + # If most of the blocks are single-line, skip the header and footer detection + if single_preproc_blocks / total_blocks > 0.5: # 50% of the blocks are single-line + return result_dict + + # Collect the bounding boxes of all blocks + all_bboxes = [] + all_texts = [] + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + all_bboxes.append(block["bbox"]) + + # Get the height of the page + page_height = max(bbox[3] for bbox in all_bboxes) + + # Get the most common bbox lists for headers and footers + common_header_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else [] + common_footer_bboxes = self.get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else [] + + # Detect and mark headers and footers + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + bbox = block["bbox"] + text = block["text"] + + is_header = compare_bbox_with_list(bbox, common_header_bboxes) + is_footer = compare_bbox_with_list(bbox, common_footer_bboxes) + + block["is_header"] = int(is_header) + block["is_footer"] = int(is_footer) + + return result_dict + + +class NonHorizontalTextProcessor: + def __init__(self) -> None: + pass + + def detect_non_horizontal_texts(self, result_dict): + """ + This function detects watermarks and vertical margin notes in the document. + + Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages. + If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page. + If the direction of these blocks is not horizontal, they are definitely considered to be watermarks. 
+ + Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages. + If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. + If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes. + + + Parameters + ---------- + result_dict : dict + The result dictionary. + + Returns + ------- + result_dict : dict + The updated result dictionary. + """ + # Dictionary to store information about potential watermarks + potential_watermarks = {} + potential_margin_notes = {} + + for page_id, page_content in result_dict.items(): + if page_id.startswith("page_"): + for block_id, block_data in page_content.items(): + if block_id.startswith("block_"): + if "dir" in block_data: + coordinates_text = (block_data["bbox"], block_data["text"]) # Tuple of coordinates and text + + angle = math.atan2(block_data["dir"][1], block_data["dir"][0]) + angle = abs(math.degrees(angle)) + + if angle > 5 and angle < 85: # Check if direction is watermarks + if coordinates_text in potential_watermarks: + potential_watermarks[coordinates_text] += 1 + else: + potential_watermarks[coordinates_text] = 1 + + if angle > 85 and angle < 105: # Check if direction is vertical + if coordinates_text in potential_margin_notes: + potential_margin_notes[coordinates_text] += 1 # Increment count + else: + potential_margin_notes[coordinates_text] = 1 # Initialize count + + # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages) + watermark_threshold = len(result_dict) // 2 + watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold} + + # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages) + margin_note_threshold = len(result_dict) 
// 2 + margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold} + + # Add watermark information to the result dictionary + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_id, block_data in blocks.items(): + coordinates_text = (block_data["bbox"], block_data["text"]) + if coordinates_text in watermarks: + block_data["is_watermark"] = 1 + else: + block_data["is_watermark"] = 0 + + if coordinates_text in margin_notes: + block_data["is_vertical_margin_note"] = 1 + else: + block_data["is_vertical_margin_note"] = 0 + + return result_dict + + +class NoiseRemover: + def __init__(self) -> None: + pass + + def skip_data_noises(self, result_dict): + """ + This function skips the data noises, including overlap blocks, header, footer, watermark, vertical margin note, title + """ + filtered_result_dict = {} + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + filtered_blocks = {} + for block_id, block in blocks.items(): + if block_id.startswith("block_"): + if any( + block.get(key, 0) + for key in [ + "is_overlap", + "is_header", + "is_footer", + "is_watermark", + "is_vertical_margin_note", + "is_block_title", + ] + ): + continue + filtered_blocks[block_id] = block + if filtered_blocks: + filtered_result_dict[page_id] = filtered_blocks + + return filtered_result_dict diff --git a/magic_pdf/para/draw.py b/magic_pdf/para/draw.py new file mode 100644 index 0000000000000000000000000000000000000000..041a21bcbeb4522b43e3ea1b70cba0ae857323a6 --- /dev/null +++ b/magic_pdf/para/draw.py @@ -0,0 +1,121 @@ +from magic_pdf.libs.commons import fitz + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class DrawAnnos: + """ + This class draws annotations on the pdf file + + ---------------------------------------- + Color Code + ---------------------------------------- + Red: (1, 0, 0) + Green: (0, 1, 0) 
+ Blue: (0, 0, 1) + Yellow: (1, 1, 0) - mix of red and green + Cyan: (0, 1, 1) - mix of green and blue + Magenta: (1, 0, 1) - mix of red and blue + White: (1, 1, 1) - red, green and blue full intensity + Black: (0, 0, 0) - no color component whatsoever + Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components + Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component + """ + + def __init__(self) -> None: + pass + + def __is_nested_list(self, lst): + """ + This function returns True if the given list is a nested list of any degree. + """ + if isinstance(lst, list): + return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst) + return False + + def __valid_rect(self, bbox): + # Ensure that the rectangle is not empty or invalid + if isinstance(bbox[0], list): + return False # It's a nested list, hence it can't be valid rect + else: + return bbox[0] < bbox[2] and bbox[1] < bbox[3] + + def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)): + """ + This function draws the nested boxes + + Parameters + ---------- + page : fitz.Page + page + nested_bbox : list + nested bbox + color : tuple + color, by default (0, 1, 1) # draw with cyan color for combined paragraph + """ + if self.__is_nested_list(nested_bbox): # If it's a nested list + for bbox in nested_bbox: + self.__draw_nested_boxes(page, bbox, color) # Recursively call the function + elif self.__valid_rect(nested_bbox): # If valid rectangle + para_rect = fitz.Rect(nested_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph + para_anno.set_border(width=1) + para_anno.update() + + def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path): + pdf_doc = open_pdf(input_pdf_path) + + if pdf_dic is None: + pdf_dic = {} + + if output_pdf_path is None: + output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf") + + for 
page_id, page in enumerate(pdf_doc): # type: ignore + page_key = f"page_{page_id}" + for ele_key, ele_data in pdf_dic[page_key].items(): + if ele_key == "para_blocks": + para_blocks = ele_data + for para_block in para_blocks: + if "paras" in para_block.keys(): + paras = para_block["paras"] + for para_key, para_content in paras.items(): + para_bbox = para_content["para_bbox"] + # print(f"para_bbox: {para_bbox}") + # print(f"is a nested list: {self.__is_nested_list(para_bbox)}") + if self.__is_nested_list(para_bbox) and len(para_bbox) > 1: + color = (0, 1, 1) + self.__draw_nested_boxes( + page, para_bbox, color + ) # draw with cyan color for combined paragraph + else: + if self.__valid_rect(para_bbox): + para_rect = fitz.Rect(para_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph + para_anno.set_border(width=0.5) + para_anno.update() + + is_para_title = para_content["is_para_title"] + if is_para_title: + if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1: + color = (0, 0, 1) + self.__draw_nested_boxes( + page, para_content["para_bbox"], color + ) # draw with cyan color for combined title + else: + if self.__valid_rect(para_content["para_bbox"]): + para_rect = fitz.Rect(para_content["para_bbox"]) + if self.__valid_rect(para_content["para_bbox"]): + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title + para_anno.set_border(width=0.5) + para_anno.update() + + pdf_doc.save(output_pdf_path) + pdf_doc.close() diff --git a/magic_pdf/para/exceptions.py b/magic_pdf/para/exceptions.py new file mode 100644 index 0000000000000000000000000000000000000000..75b19fac480ec1f1fa914f06e85228a67b3a9d7a --- /dev/null +++ b/magic_pdf/para/exceptions.py @@ -0,0 +1,198 @@ +class DenseSingleLineBlockException(Exception): + """ + This class defines the exception type for dense single line-block. 
+ """ + + def __init__(self, message="DenseSingleLineBlockException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleDetectionException(Exception): + """ + This class defines the exception type for title detection. + """ + + def __init__(self, message="TitleDetectionException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleLevelException(Exception): + """ + This class defines the exception type for title level. + """ + + def __init__(self, message="TitleLevelException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaSplitException(Exception): + """ + This class defines the exception type for paragraph splitting. + """ + + def __init__(self, message="ParaSplitException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaMergeException(Exception): + """ + This class defines the exception type for paragraph merging. 
+ """ + + def __init__(self, message="ParaMergeException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class DiscardByException: + """ + This class discards pdf files by exception + """ + + def __init__(self) -> None: + pass + + def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException): + """ + This function discards pdf files by single line block exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + exception_page_nums = 0 + page_num = 0 + for page_id, page in pdf_dic.items(): + if page_id.startswith("page_"): + page_num += 1 + if "preproc_blocks" in page.keys(): + preproc_blocks = page["preproc_blocks"] + + all_single_line_blocks = [] + for block in preproc_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9: + exception_page_nums += 1 + + if page_num == 0: + return None + + if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded + return exception.message + + return None + + def discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException): + """ + This function discards pdf files by title detection exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None + + def discard_by_title_level(self, pdf_dic, exception: TitleLevelException): + """ + This function discards pdf files by title level exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + 
return None + + def discard_by_split_para(self, pdf_dic, exception: ParaSplitException): + """ + This function discards pdf files by split para exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None + + def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException): + """ + This function discards pdf files by merge para exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + # return exception.message + return None diff --git a/magic_pdf/para/layout_match_processor.py b/magic_pdf/para/layout_match_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..4f93f1a872e183b7f45c23244579dca97a92617a --- /dev/null +++ b/magic_pdf/para/layout_match_processor.py @@ -0,0 +1,40 @@ +import math +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class LayoutFilterProcessor: + def __init__(self) -> None: + pass + + def batch_process_blocks(self, pdf_dict): + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys(): + layout_bbox_objs = blocks["layout_bboxes"] + if layout_bbox_objs is None: + continue + layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs] + + # Use math.ceil function to enlarge each value of x0, y0, x1, y1 of each layout_bbox + layout_bboxes = [ + [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes + ] + + para_blocks = blocks["para_blocks"] + if para_blocks is None: + continue + + for lb_bbox in layout_bboxes: + for i, para_block in enumerate(para_blocks): + para_bbox = para_block["bbox"] + para_blocks[i]["in_layout"] = 0 + if 
is_in_bbox(para_bbox, lb_bbox): + para_blocks[i]["in_layout"] = 1 + + blocks["para_blocks"] = para_blocks + + return pdf_dict diff --git a/magic_pdf/para/para_pipeline.py b/magic_pdf/para/para_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..13911c1a7e10c37ceba568fcf8f8a379050e4f70 --- /dev/null +++ b/magic_pdf/para/para_pipeline.py @@ -0,0 +1,297 @@ +import os +import json + +from magic_pdf.para.commons import * + +from magic_pdf.para.raw_processor import RawBlockProcessor +from magic_pdf.para.layout_match_processor import LayoutFilterProcessor +from magic_pdf.para.stats import BlockStatisticsCalculator +from magic_pdf.para.stats import DocStatisticsCalculator +from magic_pdf.para.title_processor import TitleProcessor +from magic_pdf.para.block_termination_processor import BlockTerminationProcessor +from magic_pdf.para.block_continuation_processor import BlockContinuationProcessor +from magic_pdf.para.draw import DrawAnnos +from magic_pdf.para.exceptions import ( + DenseSingleLineBlockException, + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DiscardByException, +) + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class ParaProcessPipeline: + def __init__(self) -> None: + pass + + def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None): + """ + This function processes the paragraphs, including: + 1. Read raw input json file into pdf_dic + 2. Detect and replace equations + 3. Combine spans into a natural line + 4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key + 5. Compute statistics for each block + 6. Detect titles in the document + 7. Detect paragraphs inside each block + 8. Divide the level of the titles + 9. Detect and combine paragraphs from different blocks into one paragraph + 10. 
Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable. + 11. Draw annotations on the pdf file + + Parameters + ---------- + pdf_dic_json_fpath : str + path to the pdf dictionary json file. + Notice: data noises, including overlap blocks, header, footer, watermark, vertical margin note have been removed already. + input_pdf_doc : str + path to the input pdf file + output_pdf_path : str + path to the output pdf file + + Returns + ------- + pdf_dict : dict + result dictionary + """ + + error_info = None + + output_json_file = "" + output_dir = "" + + if input_pdf_path is not None: + input_pdf_path = os.path.abspath(input_pdf_path) + + # print_green_on_red(f">>>>>>>>>>>>>>>>>>> Process the paragraphs of {input_pdf_path}") + + if output_pdf_path is not None: + output_dir = os.path.dirname(output_pdf_path) + output_json_file = f"{output_dir}/pdf_dic.json" + + def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode): + """ + Save the pdf_dic to a json file + """ + output_pdf_file_name = os.path.basename(output_pdf_path) + # output_dir = os.path.dirname(output_pdf_path) + output_dir = "\\tmp\\pdf_parse" + output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json") + pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + if para_debug_mode == "full": + with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f: + json.dump(pdf_dic, f, indent=2, ensure_ascii=False) + + # Validate the output already exists + if not os.path.exists(pdf_dic_json_fpath): + print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}") + return None + else: + print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}") + + return pdf_dic_json_fpath + + """ + Preprocess the lines of block + """ + # Find and replace the interline and inline equations, 
should be better done before the paragraph processing + # Create "para_blocks" for each page. + # equationProcessor = EquationsProcessor() + # pdf_dic = equationProcessor.batch_process_blocks(pdf_info_dict) + + # Combine spans into a natural line + rawBlockProcessor = RawBlockProcessor() + pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict) + # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n") + + # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key + layoutFilter = LayoutFilterProcessor() + pdf_dic = layoutFilter.batch_process_blocks(pdf_dic) + + # Compute statistics for each block + blockStatisticsCalculator = BlockStatisticsCalculator() + pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic) + # print(f"pdf_dic['page_0']['para_blocks'][0]: {pdf_dic['page_0']['para_blocks'][0]}", end="\n\n") + + # Compute statistics for all blocks(namely this pdf document) + docStatisticsCalculator = DocStatisticsCalculator() + pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic) + # print(f"pdf_dic['statistics']: {pdf_dic['statistics']}", end="\n\n") + + # Dump the first three stages of pdf_dic to a json file + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode) + + """ + Detect titles in the document + """ + doc_statistics = pdf_dic["statistics"] + titleProcessor = TitleProcessor(doc_statistics) + pdf_dic = titleProcessor.batch_process_blocks_detect_titles(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode) + + """ + Detect and divide the level of the titles + """ + titleProcessor = TitleProcessor() + + pdf_dic = titleProcessor.batch_process_blocks_recog_title_level(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", 
para_debug_mode=para_debug_mode) + + """ + Detect and split paragraphs inside each block + """ + blockInnerParasProcessor = BlockTerminationProcessor() + + pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode) + + # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode="full") + # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}") + + """ + Detect and combine paragraphs from different blocks into one paragraph + """ + blockContinuationProcessor = BlockContinuationProcessor() + + pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic) + pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic) + + if para_debug_mode == "full": + pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode) + + # pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode="full") + # print_green(f"pdf_dic_json_fpath: {pdf_dic_json_fpath}") + + """ + Discard pdf files by checking exceptions and return the error info to the caller + """ + discardByException = DiscardByException() + + is_discard_by_single_line_block = discardByException.discard_by_single_line_block( + pdf_dic, exception=DenseSingleLineBlockException() + ) + is_discard_by_title_detection = discardByException.discard_by_title_detection( + pdf_dic, exception=TitleDetectionException() + ) + is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException()) + is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException()) + is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException()) + + """ + if any( + info is not None + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + 
is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + ): + error_info = next( + ( + info + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + if info is not None + ), + None, + ) + return pdf_dic, error_info + + if any( + info is not None + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + ): + error_info = next( + ( + info + for info in [ + is_discard_by_single_line_block, + is_discard_by_title_detection, + is_discard_by_title_level, + is_discard_by_split_para, + is_discard_by_merge_para, + ] + if info is not None + ), + None, + ) + return pdf_dic, error_info + """ + + """ + Dump the final pdf_dic to a json file + """ + if para_debug_mode is not None: + with open(output_json_file, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + + """ + Draw the annotations + """ + + if is_discard_by_single_line_block is not None: + error_info = is_discard_by_single_line_block + elif is_discard_by_title_detection is not None: + error_info = is_discard_by_title_detection + elif is_discard_by_title_level is not None: + error_info = is_discard_by_title_level + elif is_discard_by_split_para is not None: + error_info = is_discard_by_split_para + elif is_discard_by_merge_para is not None: + error_info = is_discard_by_merge_para + + if error_info is not None: + return pdf_dic, error_info + + """ + Dump the final pdf_dic to a json file + """ + if para_debug_mode is not None: + with open(output_json_file, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + + """ + Draw the annotations + """ + if para_debug_mode is not None: + drawAnnos = DrawAnnos() + drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path) + + """ + Remove the 
intermediate files which are generated in the process of paragraph processing if debug_mode is simple + """ + if para_debug_mode is not None: + for fpath in os.listdir(output_dir): + if fpath.endswith(".json") and "stage" in fpath: + os.remove(os.path.join(output_dir, fpath)) + + return pdf_dic, error_info diff --git a/magic_pdf/para/para_split.py b/magic_pdf/para/para_split.py new file mode 100644 index 0000000000000000000000000000000000000000..c9808abeda4f2130882eb7a40c611002fdb7e9dc --- /dev/null +++ b/magic_pdf/para/para_split.py @@ -0,0 +1,644 @@ +from sklearn.cluster import DBSCAN +import numpy as np +from loguru import logger + +from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout +from magic_pdf.libs.ocr_content_type import ContentType + + +LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?',":", ":", ")", ")", ";"] +INLINE_EQUATION = ContentType.InlineEquation +INTERLINE_EQUATION = ContentType.InterlineEquation +TEXT = ContentType.Text + + +def __get_span_text(span): + c = span.get('content', '') + if len(c)==0: + c = span.get('image_path', '') + + return c + + +def __detect_list_lines(lines, new_layout_bboxes, lang): + """ + 探测是否包含了列表,并且把列表的行分开. 
+ 这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。 + """ + def find_repeating_patterns(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst) - 1: # 确保余下元素至少有2个 + if lst[i] == 1 and lst[i+1] in [2, 3]: # 额外检查以防止连续出现的1 + start = i + ones_in_this_interval = [i] + i += 1 + while i < len(lst) and lst[i] in [2, 3]: + i += 1 + # 验证下一个序列是否符合条件 + if i < len(lst) - 1 and lst[i] == 1 and lst[i+1] in [2, 3] and lst[i-1] in [2, 3]: + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + else: + i += 1 + return indices, ones_indices + """====================""" + def split_indices(slen, index_array): + result = [] + last_end = 0 + + for start, end in sorted(index_array): + if start > last_end: + # 前一个区间结束到下一个区间开始之间的部分标记为"text" + result.append(('text', last_end, start - 1)) + # 区间内标记为"list" + result.append(('list', start, end)) + last_end = end + 1 + + if last_end < slen: + # 如果最后一个区间结束后还有剩余的字符串,将其标记为"text" + result.append(('text', last_end, slen - 1)) + + return result + """====================""" + + if lang!='en': + return lines, None + else: + total_lines = len(lines) + line_fea_encode = [] + """ + 对每一行进行特征编码,编码规则如下: + 1. 如果行顶格,且大写字母开头或者数字开头,编码为1 + 2. 如果顶格,其他非大写开头编码为4 + 3. 如果非顶格,首字符大写,编码为2 + 4. 
如果非顶格,首字符非大写编码为3 + """ + for l in lines: + first_char = __get_span_text(l['spans'][0])[0] + layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0] + if l['bbox'][0] == layout_left: + if first_char.isupper() or first_char.isdigit(): + line_fea_encode.append(1) + else: + line_fea_encode.append(4) + else: + if first_char.isupper(): + line_fea_encode.append(2) + else: + line_fea_encode.append(3) + + # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 + + list_indice, list_start_idx = find_repeating_patterns(line_fea_encode) + if len(list_indice)>0: + logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}") + + # TODO check一下这个特列表里缩进的行左侧是不是对齐的。 + segments = [] + for start, end in list_indice: + for i in range(start, end+1): + if i>0: + if line_fea_encode[i] == 4: + logger.info(f"列表行的第{i}行不是顶格的") + break + else: + logger.info(f"列表行的第{start}到第{end}行是列表") + + return split_indices(total_lines, list_indice), list_start_idx + + + +def __valign_lines(blocks, layout_bboxes): + """ + 在一个layoutbox内对齐行的左侧和右侧。 + 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。 + 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。 + """ + + min_distance = 3 + min_sample = 2 + new_layout_bboxes = [] + + for layout_box in layout_bboxes: + blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], layout_box['layout_bbox'])] + if len(blocks_in_layoutbox)==0: + continue + + x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + x1_uniq_label = np.unique(x1_clusters.labels_) + + x0_2_new_val = {} # 存储旧值对应的新值映射 + x1_2_new_val = {} + for label in x0_uniq_label: + if label==-1: + continue + x0_index_of_label = 
np.where(x0_clusters.labels_==label) + x0_raw_val = x0_lst[x0_index_of_label][:,0] + x0_new_val = np.min(x0_lst[x0_index_of_label][:,0]) + x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val}) + for label in x1_uniq_label: + if label==-1: + continue + x1_index_of_label = np.where(x1_clusters.labels_==label) + x1_raw_val = x1_lst[x1_index_of_label][:,0] + x1_new_val = np.max(x1_lst[x1_index_of_label][:,0]) + x1_2_new_val.update({idx: x1_new_val for idx in x1_raw_val}) + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 in x0_2_new_val: + line['bbox'][0] = int(x0_2_new_val[x0]) + + if x1 in x1_2_new_val: + line['bbox'][2] = int(x1_2_new_val[x1]) + # 其余对不齐的保持不动 + + # 由于修改了block里的line长度,现在需要重新计算block的bbox + for block in blocks_in_layoutbox: + block['bbox'] = [min([line['bbox'][0] for line in block['lines']]), + min([line['bbox'][1] for line in block['lines']]), + max([line['bbox'][2] for line in block['lines']]), + max([line['bbox'][3] for line in block['lines']])] + + """新计算layout的bbox,因为block的bbox变了。""" + layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox]) + layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox]) + layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox]) + layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox]) + new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1]) + + return new_layout_bboxes + + +def __align_text_in_layout(blocks, layout_bboxes): + """ + 由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。 + """ + for layout in layout_bboxes: + lb = layout['layout_bbox'] + blocks_in_layoutbox = [b for b in blocks if is_in_layout(b['bbox'], lb)] + if len(blocks_in_layoutbox)==0: + continue + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 < lb[0]: + line['bbox'][0] = lb[0] + if x1 > lb[2]: + line['bbox'][2] = lb[2] + + +def 
__common_pre_proc(blocks, layout_bboxes): + """ + 不分语言的,对文本进行预处理 + """ + #__add_line_period(blocks, layout_bboxes) + __align_text_in_layout(blocks, layout_bboxes) + aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) + + return aligned_layout_bboxes + +def __pre_proc_zh_blocks(blocks, layout_bboxes): + """ + 对中文文本进行分段预处理 + """ + pass + + +def __pre_proc_en_blocks(blocks, layout_bboxes): + """ + 对英文文本进行分段预处理 + """ + pass + + +def __group_line_by_layout(blocks, layout_bboxes, lang="en"): + """ + 每个layout内的行进行聚合 + """ + # 因为只是一个block一行目前, 一个block就是一个段落 + lines_group = [] + + for lyout in layout_bboxes: + lines = [line for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox']) for line in block['lines']] + lines_group.append(lines) + + return lines_group + + +def __split_para_in_layoutbox(lines_group, new_layout_bbox, lang="en", char_avg_len=10): + """ + lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 + 1. 先计算每个group的左右边界。 + 2. 然后根据行末尾特征进行分段。 + 末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。 + 且下一行开头不留空白。 + + """ + list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + layout_paras = [] + right_tail_distance = 1.5 * char_avg_len + + for lines in lines_group: + paras = [] + total_lines = len(lines) + if total_lines==0: + continue # 0行无需处理 + if total_lines==1: # 1行无法分段。 + layout_paras.append([lines]) + list_info.append([False, False]) + continue + + """在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测, + 对齐方式分为以下: + 1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写) + 1) 右侧对齐的行,单独成一段 + 2) 中间对齐的行,按照字体/行高聚合成一段 + 2. 
左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表) + 这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。 + """ + + text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang) + """根据list_range,把lines分成几个部分 + + """ + + layout_right = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[2] + layout_left = __find_layout_bbox_by_line(lines[0]['bbox'], new_layout_bbox)[0] + para = [] # 元素是line + layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for content_type, start, end in text_segments: + if content_type == 'list': + for i, line in enumerate(lines[start:end+1]): + line_x0 = line['bbox'][0] + if line_x0 == layout_left: # 列表开头 + if len(para)>0: + paras.append(para) + para = [] + para.append(line) + else: + para.append(line) + if len(para)>0: + paras.append(para) + para = [] + if start==0: + layout_list_info[0] = True + if end==total_lines-1: + layout_list_info[1] = True + else: # 是普通文本 + for i, line in enumerate(lines[start:end+1]): + # 如果i有下一行,那么就要根据下一行位置综合判断是否要分段。如果i之后没有行,那么只需要判断i行自己的结尾特征。 + cur_line_type = line['spans'][-1]['type'] + next_line = lines[i+1] if i= layout_right - right_tail_distance and next_line and next_line['bbox'][0] == layout_left: # 现在这行到了行尾沾满,下一行存在且顶格。 + para.append(line) + else: + para.append(line) + paras.append(para) + para = [] + else: # 其他,图片、表格、行间公式,各自占一段 + if len(para)>0: # 先把之前的段落加入到结果中 + paras.append(para) + para = [] + paras.append([line]) # 再把当前行加入到结果中。当前行为行间公式、图、表等。 + para = [] + + if len(para)>0: + paras.append(para) + para = [] + + list_info.append(layout_list_info) + layout_paras.append(paras) + paras = [] + + + return layout_paras, list_info + +def __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(layout_paras)==0 or 
len(layout_list_info)==0: # 0的时候最后的return 会出错 + return layout_paras, [False, False] + + for i in range(1, len(layout_paras)): + pre_layout_list_info = layout_list_info[i-1] + next_layout_list_info = layout_list_info[i] + pre_last_para = layout_paras[i-1][-1] + next_paras = layout_paras[i] + next_first_para = next_paras[0] + + if pre_layout_list_info[1] and not next_layout_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + for j in range(len(next_paras)): + line = next_paras[j] + if len(line)==1: # 只可能是一行,多行情况再需要分析了 + if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], new_layout_bbox)[0]: + may_list_lines.append(line[0]) + else: + break + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1: + pre_last_para.extend(may_list_lines) + layout_paras[i] = layout_paras[i][len(may_list_lines):] + + return layout_paras, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息 + + +def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, pre_page_list_info, next_page_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(pre_page_paras)==0 or len(next_page_paras)==0: # 0的时候最后的return 会出错 + return False + + if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + for j in range(len(next_page_paras[0])): + line = next_page_paras[0][j] + if len(line)==1: # 只可能是一行,多行情况再需要分析了 + if line[0]['bbox'][0] > __find_layout_bbox_by_line(line[0]['bbox'], next_page_layout_bbox)[0]: + may_list_lines.append(line[0]) + else: + break + else: + 
break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines)>0 and len(set([x['bbox'][0] for x in may_list_lines]))==1: + pre_page_paras[-1].append(may_list_lines) + next_page_paras[0] = next_page_paras[0][len(may_list_lines):] + return True + + return False + + +def __find_layout_bbox_by_line(line_bbox, layout_bboxes): + """ + 根据line找到所在的layout + """ + for layout in layout_bboxes: + if is_in_layout(line_bbox, layout): + return layout + return None + + +def __connect_para_inter_layoutbox(layout_paras, new_layout_bbox, lang): + """ + layout之间进行分段。 + 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 + 连接的条件需要同时满足: + 1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。 + 2. 下一行开头不留空白。 + + """ + connected_layout_paras = [] + if len(layout_paras)==0: + return connected_layout_paras + + connected_layout_paras.append(layout_paras[0]) + for i in range(1, len(layout_paras)): + try: + if len(layout_paras[i])==0 or len(layout_paras[i-1])==0: # TODO 考虑连接问题, + continue + pre_last_line = layout_paras[i-1][-1][-1] + next_first_line = layout_paras[i][0][0] + except Exception as e: + logger.error(f"page layout {i} has no line") + continue + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: + connected_layout_paras.append(layout_paras[i]) + continue + + pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox)[2] + next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox)[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and 
next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + connected_layout_paras[-1][-1].extend(layout_paras[i][0]) + layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。 + if len(layout_paras[i])==0: + layout_paras.pop(i) + else: + connected_layout_paras.append(layout_paras[i]) + else: + """连接段落条件不成立,将前一个layout的段落加入到结果中。""" + connected_layout_paras.append(layout_paras[i]) + + return connected_layout_paras + + +def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang): + """ + 连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。 + 是否可以连接的条件: + 1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。 + 2. 后一个页面的第一个段落第一行没有空白开头。 + """ + # 有的页面可能压根没有文字 + if len(pre_page_paras)==0 or len(next_page_paras)==0 or len(pre_page_paras[0])==0 or len(next_page_paras[0])==0: # TODO [[]]为什么出现在pre_page_paras里? + return False + pre_last_para = pre_page_paras[-1][-1] + next_first_para = next_page_paras[0][0] + pre_last_line = pre_last_para[-1] + next_first_line = next_first_para[0] + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况 + # 不是文本,不连接 + return False + + pre_x2_max = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox)[2] + next_x0_min = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox)[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and 
next_first_line['bbox'][0]==next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + pre_last_para.extend(next_first_para) + next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。 + return True + else: + return False + +def find_consecutive_true_regions(input_array): + start_index = None # 连续True区域的起始索引 + regions = [] # 用于保存所有连续True区域的起始和结束索引 + + for i in range(len(input_array)): + # 如果我们找到了一个True值,并且当前并没有在连续True区域中 + if input_array[i] and start_index is None: + start_index = i # 记录连续True区域的起始索引 + + # 如果我们找到了一个False值,并且当前在连续True区域中 + elif not input_array[i] and start_index is not None: + # 如果连续True区域长度大于1,那么将其添加到结果列表中 + if i - start_index > 1: + regions.append((start_index, i-1)) + start_index = None # 重置起始索引 + + # 如果最后一个元素是True,那么需要将最后一个连续True区域加入到结果列表中 + if start_index is not None and len(input_array) - start_index > 1: + regions.append((start_index, len(input_array)-1)) + + return regions + + +def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode): + """ + 找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。 + 一个line居中的条件是: + 1. 水平中心点跨越layout的中心点。 + 2. 
左右两侧都有空白 + """ + + for layout_i, layout_para in enumerate(page_paras): + layout_box = new_layout_bbox[layout_i] + single_line_paras_tag = [] + for i in range(len(layout_para)): + single_line_paras_tag.append(len(layout_para[i])==1 and layout_para[i][0]['spans'][0]['type']==TEXT) + + """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。""" + consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag) + if len(consecutive_single_line_indices)>0: + index_offset = 0 + """检查这些行是否是高度相同的,居中的""" + for start, end in consecutive_single_line_indices: + start += index_offset + end += index_offset + line_hi = np.array([line[0]['bbox'][3]-line[0]['bbox'][1] for line in layout_para[start:end+1]]) + first_line_text = ''.join([__get_span_text(span) for span in layout_para[start][0]['spans']]) + if "Table" in first_line_text or "Figure" in first_line_text: + pass + if debug_mode: + logger.debug(line_hi.std()) + + if line_hi.std()<2: + """行高度相同,那么判断是否居中""" + all_left_x0 = [line[0]['bbox'][0] for line in layout_para[start:end+1]] + all_right_x1 = [line[0]['bbox'][2] for line in layout_para[start:end+1]] + layout_center = (layout_box[0] + layout_box[2]) / 2 + if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \ + and not all([x0==layout_box[0] for x0 in all_left_x0]) \ + and not all([x1==layout_box[2] for x1 in all_right_x1]): + merge_para = [l[0] for l in layout_para[start:end+1]] + para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) + if debug_mode: + logger.debug(para_text) + layout_para[start:end+1] = [merge_para] + index_offset -= end-start + + return + + +def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang): + """ + 找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。 + """ + + pass + + +def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): + """ + 根据line和layout情况进行分段 + 先实现一个根据行末尾特征分段的简单方法。 + """ + """ + 算法思路: + 1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。 + 2. 
从上述行中找到末尾是句号等可作为断行标志的行。 + 3. 参照上述行尾特征进行分段。 + 4. 图、表,目前独占一行,不考虑分段。 + """ + if page_num==343: + pass + lines_group = __group_line_by_layout(blocks, layout_bboxes, lang) # block内分段 + layout_paras, layout_list_info = __split_para_in_layoutbox(lines_group, new_layout_bbox, lang) # layout内分段 + layout_paras2, page_list_info = __connect_list_inter_layout(layout_paras, new_layout_bbox, layout_list_info, page_num, lang) # layout之间连接列表段落 + connected_layout_paras = __connect_para_inter_layoutbox(layout_paras2, new_layout_bbox, lang) # layout间链接段落 + + + return connected_layout_paras, page_list_info + + +def para_split(pdf_info_dict, debug_mode, lang="en"): + """ + 根据line和layout情况进行分段 + """ + new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS + all_page_list_info = [] # 保存每个页面开头和结尾是否是列表 + for page_num, page in pdf_info_dict.items(): + blocks = page['preproc_blocks'] + layout_bboxes = page['layout_bboxes'] + new_layout_bbox = __common_pre_proc(blocks, layout_bboxes) + new_layout_of_pages.append(new_layout_bbox) + splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang) + all_page_list_info.append(page_list_info) + page['para_blocks'] = splited_blocks + + """连接页面与页面之间的可能合并的段落""" + pdf_infos = list(pdf_info_dict.values()) + for page_num, page in enumerate(pdf_info_dict.values()): + if page_num==0: + continue + pre_page_paras = pdf_infos[page_num-1]['para_blocks'] + next_page_paras = pdf_infos[page_num]['para_blocks'] + pre_page_layout_bbox = new_layout_of_pages[page_num-1] + next_page_layout_bbox = new_layout_of_pages[page_num] + + is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, lang) + if debug_mode: + if is_conn: + logger.info(f"连接了第{page_num-1}页和第{page_num}页的段落") + + is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, all_page_list_info[page_num-1], all_page_list_info[page_num], page_num, 
lang) + if debug_mode: + if is_list_conn: + logger.info(f"连接了第{page_num-1}页和第{page_num}页的列表段落") + + """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接 + 1. 正文中有时出现一个行顶格,接下来几行缩进的情况。 + 2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。 + """ + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + new_layout_bbox = new_layout_of_pages[page_num] + __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode) + __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang) diff --git a/magic_pdf/para/para_split_v2.py b/magic_pdf/para/para_split_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ec9dcee3a20d69fd3d30b1136d0528e2673a92c0 --- /dev/null +++ b/magic_pdf/para/para_split_v2.py @@ -0,0 +1,787 @@ +from sklearn.cluster import DBSCAN +import numpy as np +from loguru import logger +import re +from magic_pdf.libs.boxbase import _is_in_or_part_overlap_with_area_ratio as is_in_layout +from magic_pdf.libs.ocr_content_type import ContentType, BlockType +from magic_pdf.model.magic_model import MagicModel +from magic_pdf.libs.Constants import * + +LINE_STOP_FLAG = ['.', '!', '?', '。', '!', '?', ":", ":", ")", ")", ";"] +INLINE_EQUATION = ContentType.InlineEquation +INTERLINE_EQUATION = ContentType.InterlineEquation +TEXT = ContentType.Text +debug_able = False + + +def __get_span_text(span): + c = span.get('content', '') + if len(c) == 0: + c = span.get('image_path', '') + + return c + + +def __detect_list_lines(lines, new_layout_bboxes, lang): + global debug_able + """ + 探测是否包含了列表,并且把列表的行分开. 
+ 这样的段落特点是,顶格字母大写/数字,紧跟着几行缩进的。缩进的行首字母含小写的。 + """ + + def find_repeating_patterns2(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst): # Loop through the entire list + if lst[i] == 1: # If we encounter a '1', we might be at the start of a pattern + start = i + ones_in_this_interval = [i] + i += 1 + # Traverse elements that are 1, 2 or 3, until we encounter something else + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + if len(ones_in_this_interval) > 1 or ( + start < len(lst) - 1 and ones_in_this_interval and lst[start + 1] in [2, 3]): + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + return indices, ones_indices + + def find_repeating_patterns(lst): + indices = [] + ones_indices = [] + i = 0 + while i < len(lst) - 1: # 确保余下元素至少有2个 + if lst[i] == 1 and lst[i + 1] in [2, 3]: # 额外检查以防止连续出现的1 + start = i + ones_in_this_interval = [i] + i += 1 + while i < len(lst) and lst[i] in [2, 3]: + i += 1 + # 验证下一个序列是否符合条件 + if i < len(lst) - 1 and lst[i] == 1 and lst[i + 1] in [2, 3] and lst[i - 1] in [2, 3]: + while i < len(lst) and lst[i] in [1, 2, 3]: + if lst[i] == 1: + ones_in_this_interval.append(i) + i += 1 + indices.append((start, i - 1)) + ones_indices.append(ones_in_this_interval) + else: + i += 1 + else: + i += 1 + return indices, ones_indices + + """====================""" + + def split_indices(slen, index_array): + result = [] + last_end = 0 + + for start, end in sorted(index_array): + if start > last_end: + # 前一个区间结束到下一个区间开始之间的部分标记为"text" + result.append(('text', last_end, start - 1)) + # 区间内标记为"list" + result.append(('list', start, end)) + last_end = end + 1 + + if last_end < slen: + # 如果最后一个区间结束后还有剩余的字符串,将其标记为"text" + result.append(('text', last_end, slen - 1)) + + return result + + """====================""" + + if lang != 'en': + return lines, None + else: + total_lines = len(lines) + line_fea_encode = [] + """ + 对每一行进行特征编码,编码规则如下: + 1. 
如果行顶格,且大写字母开头或者数字开头,编码为1 + 2. 如果顶格,其他非大写开头编码为4 + 3. 如果非顶格,首字符大写,编码为2 + 4. 如果非顶格,首字符非大写编码为3 + """ + if len(lines) > 0: + x_map_tag_dict, min_x_tag = cluster_line_x(lines) + for l in lines: + span_text = __get_span_text(l['spans'][0]) + first_char = span_text[0] + layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes) + if not layout: + line_fea_encode.append(0) + else: + # + if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag: + # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum(): + if not first_char.isalnum() or if_match_reference_list(span_text): + line_fea_encode.append(1) + else: + line_fea_encode.append(4) + else: + if first_char.isupper(): + line_fea_encode.append(2) + else: + line_fea_encode.append(3) + + # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行,认为是列表。 + + list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode) + if len(list_indice) > 0: + if debug_able: + logger.info(f"发现了列表,列表行数:{list_indice}, {list_start_idx}") + + # TODO check一下这个特列表里缩进的行左侧是不是对齐的。 + segments = [] + for start, end in list_indice: + for i in range(start, end + 1): + if i > 0: + if line_fea_encode[i] == 4: + if debug_able: + logger.info(f"列表行的第{i}行不是顶格的") + break + else: + if debug_able: + logger.info(f"列表行的第{start}到第{end}行是列表") + + return split_indices(total_lines, list_indice), list_start_idx + + +def cluster_line_x(lines: list) -> dict: + """ + 对一个block内所有lines的bbox的x0聚类 + """ + min_distance = 5 + min_sample = 1 + x0_lst = np.array([[round(line['bbox'][0]), 0] for line in lines]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + #x1_lst = np.array([[line['bbox'][2], 0] for line in lines]) + x0_2_new_val = {} # 存储旧值对应的新值映射 + min_x0 = round(lines[0]["bbox"][0]) + for label in x0_uniq_label: + if label == -1: + continue + x0_index_of_label = np.where(x0_clusters.labels_ == label) + x0_raw_val = x0_lst[x0_index_of_label][:, 0] + x0_new_val = 
np.min(x0_lst[x0_index_of_label][:, 0]) + x0_2_new_val.update({round(raw_val): round(x0_new_val) for raw_val in x0_raw_val}) + if x0_new_val < min_x0: + min_x0 = x0_new_val + return x0_2_new_val, min_x0 + + +def if_match_reference_list(text: str) -> bool: + pattern = re.compile(r'^\d+\..*') + if pattern.match(text): + return True + else: + return False + + +def __valign_lines(blocks, layout_bboxes): + """ + 在一个layoutbox内对齐行的左侧和右侧。 + 扫描行的左侧和右侧,如果x0, x1差距不超过一个阈值,就强行对齐到所处layout的左右两侧(和layout有一段距离)。 + 3是个经验值,TODO,计算得来,可以设置为1.5个正文字符。 + """ + + min_distance = 3 + min_sample = 2 + new_layout_bboxes = [] + + for layout_box in layout_bboxes: + blocks_in_layoutbox = [b for b in blocks if + b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])] + if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0: + new_layout_bboxes.append(layout_box['layout_bbox']) + continue + + x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x1_lst = np.array([[line['bbox'][2], 0] for block in blocks_in_layoutbox for line in block['lines']]) + x0_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x0_lst) + x1_clusters = DBSCAN(eps=min_distance, min_samples=min_sample).fit(x1_lst) + x0_uniq_label = np.unique(x0_clusters.labels_) + x1_uniq_label = np.unique(x1_clusters.labels_) + + x0_2_new_val = {} # 存储旧值对应的新值映射 + x1_2_new_val = {} + for label in x0_uniq_label: + if label == -1: + continue + x0_index_of_label = np.where(x0_clusters.labels_ == label) + x0_raw_val = x0_lst[x0_index_of_label][:, 0] + x0_new_val = np.min(x0_lst[x0_index_of_label][:, 0]) + x0_2_new_val.update({idx: x0_new_val for idx in x0_raw_val}) + for label in x1_uniq_label: + if label == -1: + continue + x1_index_of_label = np.where(x1_clusters.labels_ == label) + x1_raw_val = x1_lst[x1_index_of_label][:, 0] + x1_new_val = np.max(x1_lst[x1_index_of_label][:, 0]) + x1_2_new_val.update({idx: x1_new_val for idx in 
x1_raw_val}) + + for block in blocks_in_layoutbox: + for line in block['lines']: + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 in x0_2_new_val: + line['bbox'][0] = int(x0_2_new_val[x0]) + + if x1 in x1_2_new_val: + line['bbox'][2] = int(x1_2_new_val[x1]) + # 其余对不齐的保持不动 + + # 由于修改了block里的line长度,现在需要重新计算block的bbox + for block in blocks_in_layoutbox: + if len(block["lines"]) > 0: + block['bbox'] = [min([line['bbox'][0] for line in block['lines']]), + min([line['bbox'][1] for line in block['lines']]), + max([line['bbox'][2] for line in block['lines']]), + max([line['bbox'][3] for line in block['lines']])] + + """新计算layout的bbox,因为block的bbox变了。""" + layout_x0 = min([block['bbox'][0] for block in blocks_in_layoutbox]) + layout_y0 = min([block['bbox'][1] for block in blocks_in_layoutbox]) + layout_x1 = max([block['bbox'][2] for block in blocks_in_layoutbox]) + layout_y1 = max([block['bbox'][3] for block in blocks_in_layoutbox]) + new_layout_bboxes.append([layout_x0, layout_y0, layout_x1, layout_y1]) + + return new_layout_bboxes + + +def __align_text_in_layout(blocks, layout_bboxes): + """ + 由于ocr出来的line,有时候会在前后有一段空白,这个时候需要对文本进行对齐,超出的部分被layout左右侧截断。 + """ + for layout in layout_bboxes: + lb = layout['layout_bbox'] + blocks_in_layoutbox = [block for block in blocks if + block["type"] == BlockType.Text and is_in_layout(block['bbox'], lb)] + if len(blocks_in_layoutbox) == 0: + continue + + for block in blocks_in_layoutbox: + for line in block.get("lines", []): + x0, x1 = line['bbox'][0], line['bbox'][2] + if x0 < lb[0]: + line['bbox'][0] = lb[0] + if x1 > lb[2]: + line['bbox'][2] = lb[2] + + +def __common_pre_proc(blocks, layout_bboxes): + """ + 不分语言的,对文本进行预处理 + """ + # __add_line_period(blocks, layout_bboxes) + __align_text_in_layout(blocks, layout_bboxes) + aligned_layout_bboxes = __valign_lines(blocks, layout_bboxes) + + return aligned_layout_bboxes + + +def __pre_proc_zh_blocks(blocks, layout_bboxes): + """ + 对中文文本进行分段预处理 + """ + pass + + +def 
__pre_proc_en_blocks(blocks, layout_bboxes): + """ + 对英文文本进行分段预处理 + """ + pass + + +def __group_line_by_layout(blocks, layout_bboxes): + """ + 每个layout内的行进行聚合 + """ + # 因为只是一个block一行目前, 一个block就是一个段落 + blocks_group = [] + for lyout in layout_bboxes: + blocks_in_layout = [block for block in blocks if is_in_layout(block['bbox'], lyout['layout_bbox'])] + blocks_group.append(blocks_in_layout) + return blocks_group + + +def __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang="en"): + """ + lines_group 进行行分段——layout内部进行分段。lines_group内每个元素是一个Layoutbox内的所有行。 + 1. 先计算每个group的左右边界。 + 2. 然后根据行末尾特征进行分段。 + 末尾特征:以句号等结束符结尾。并且距离右侧边界有一定距离。 + 且下一行开头不留空白。 + + """ + list_info = [] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for blocks in blocks_group: + is_start_list = None + is_end_list = None + if len(blocks) == 0: + list_info.append([False, False]) + continue + if blocks[0]["type"] != BlockType.Text and blocks[-1]["type"] != BlockType.Text: + list_info.append([False, False]) + continue + if blocks[0]["type"] != BlockType.Text: + is_start_list = False + if blocks[-1]["type"] != BlockType.Text: + is_end_list = False + + lines = [line for block in blocks if + block["type"] == BlockType.Text for line in + block['lines']] + total_lines = len(lines) + if total_lines == 1 or total_lines == 0: + list_info.append([False, False]) + continue + """在进入到真正的分段之前,要对文字块从统计维度进行对齐方式的探测, + 对齐方式分为以下: + 1. 左对齐的文本块(特点是左侧顶格,或者左侧不顶格但是右侧顶格的行数大于非顶格的行数,顶格的首字母有大写也有小写) + 1) 右侧对齐的行,单独成一段 + 2) 中间对齐的行,按照字体/行高聚合成一段 + 2. 
左对齐的列表块(其特点是左侧顶格的行数小于等于非顶格的行数,非定格首字母会有小写,顶格90%是大写。并且左侧顶格行数大于1,大于1是为了这种模式连续出现才能称之为列表) + 这样的文本块,顶格的为一个段落开头,紧随其后非顶格的行属于这个段落。 + """ + text_segments, list_start_line = __detect_list_lines(lines, new_layout_bbox, lang) + """根据list_range,把lines分成几个部分 + + """ + for list_start in list_start_line: + if len(list_start) > 1: + for i in range(0, len(list_start)): + index = list_start[i] - 1 + if index >= 0: + if "content" in lines[index]["spans"][-1]: + lines[index]["spans"][-1]["content"] += '\n\n' + layout_list_info = [False, False] # 这个layout最后是不是列表,记录每一个layout里是不是列表开头,列表结尾 + for content_type, start, end in text_segments: + if content_type == 'list': + if start == 0 and is_start_list is None: + layout_list_info[0] = True + if end == total_lines - 1 and is_end_list is None: + layout_list_info[1] = True + + list_info.append(layout_list_info) + return list_info + + +def __split_para_lines(lines: list, text_blocks: list) -> list: + text_paras = [] + other_paras = [] + text_lines = [] + for line in lines: + + spans_types = [span["type"] for span in line] + if ContentType.Table in spans_types: + other_paras.append([line]) + continue + if ContentType.Image in spans_types: + other_paras.append([line]) + continue + if ContentType.InterlineEquation in spans_types: + other_paras.append([line]) + continue + text_lines.append(line) + + for block in text_blocks: + block_bbox = block["bbox"] + para = [] + for line in text_lines: + bbox = line["bbox"] + if is_in_layout(bbox, block_bbox): + para.append(line) + if len(para) > 0: + text_paras.append(para) + paras = other_paras.extend(text_paras) + paras_sorted = sorted(paras, key=lambda x: x[0]["bbox"][1]) + return paras_sorted + + +def __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, page_num, lang): + global debug_able + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(blocks_group) == 0 or 
len(blocks_group) == 0: # 0的时候最后的return 会出错 + return blocks_group, [False, False] + + for i in range(1, len(blocks_group)): + if len(blocks_group[i]) == 0 or len(blocks_group[i - 1]) == 0: + continue + pre_layout_list_info = layout_list_info[i - 1] + next_layout_list_info = layout_list_info[i] + pre_last_para = blocks_group[i - 1][-1].get("lines", []) + next_paras = blocks_group[i] + next_first_para = next_paras[0] + + if pre_layout_list_info[1] and not next_layout_list_info[0] and next_first_para[ + "type"] == BlockType.Text: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + if debug_able: + logger.info(f"连接page {page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + lines = next_first_para.get("lines", []) + + for line in lines: + if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], new_layout_bbox)[0]: + may_list_lines.append(line) + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: + pre_last_para.extend(may_list_lines) + next_first_para["lines"] = next_first_para["lines"][len(may_list_lines):] + + return blocks_group, [layout_list_info[0][0], layout_list_info[-1][1]] # 同时还返回了这个页面级别的开头、结尾是不是列表的信息 + + +def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, + pre_page_list_info, next_page_list_info, page_num, lang): + """ + 如果上个layout的最后一个段落是列表,下一个layout的第一个段落也是列表,那么将他们连接起来。 TODO 因为没有区分列表和段落,所以这个方法暂时不实现。 + 根据layout_list_info判断是不是列表。,下个layout的第一个段如果不是列表,那么看他们是否有几行都有相同的缩进。 + """ + if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错 + return False + if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0: + return False + if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text: + return False + if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进 + if debug_able: + logger.info(f"连接page 
{page_num} 内的list") + # 向layout_paras[i] 寻找开头具有相同缩进的连续的行 + may_list_lines = [] + next_page_first_para = next_page_paras[0][0] + if next_page_first_para["type"] == BlockType.Text: + lines = next_page_first_para["lines"] + for line in lines: + if line['bbox'][0] > __find_layout_bbox_by_line(line['bbox'], next_page_layout_bbox)[0]: + may_list_lines.append(line) + else: + break + # 如果这些行的缩进是相等的,那么连到上一个layout的最后一个段落上。 + if len(may_list_lines) > 0 and len(set([x['bbox'][0] for x in may_list_lines])) == 1: + #pre_page_paras[-1].append(may_list_lines) + # 下一页合并到上一页最后一段,打一个cross_page的标签 + for line in may_list_lines: + for span in line["spans"]: + span[CROSS_PAGE] = True + pre_page_paras[-1][-1]["lines"].extend(may_list_lines) + next_page_first_para["lines"] = next_page_first_para["lines"][len(may_list_lines):] + return True + + return False + + +def __find_layout_bbox_by_line(line_bbox, layout_bboxes): + """ + 根据line找到所在的layout + """ + for layout in layout_bboxes: + if is_in_layout(line_bbox, layout): + return layout + return None + + +def __connect_para_inter_layoutbox(blocks_group, new_layout_bbox): + """ + layout之间进行分段。 + 主要是计算前一个layOut的最后一行和后一个layout的第一行是否可以连接。 + 连接的条件需要同时满足: + 1. 上一个layout的最后一行沾满整个行。并且没有结尾符号。 + 2. 
下一行开头不留空白。 + + """ + connected_layout_blocks = [] + if len(blocks_group) == 0: + return connected_layout_blocks + + connected_layout_blocks.append(blocks_group[0]) + for i in range(1, len(blocks_group)): + try: + if len(blocks_group[i]) == 0: + continue + if len(blocks_group[i - 1]) == 0: # TODO 考虑连接问题, + connected_layout_blocks.append(blocks_group[i]) + continue + # text类型的段才需要考虑layout间的合并 + if blocks_group[i - 1][-1]["type"] != BlockType.Text or blocks_group[i][0]["type"] != BlockType.Text: + connected_layout_blocks.append(blocks_group[i]) + continue + if len(blocks_group[i - 1][-1]["lines"]) == 0 or len(blocks_group[i][0]["lines"]) == 0: + connected_layout_blocks.append(blocks_group[i]) + continue + pre_last_line = blocks_group[i - 1][-1]["lines"][-1] + next_first_line = blocks_group[i][0]["lines"][0] + except Exception as e: + logger.error(f"page layout {i} has no line") + continue + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, INLINE_EQUATION]: + #connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + continue + pre_layout = __find_layout_bbox_by_line(pre_last_line['bbox'], new_layout_bbox) + next_layout = __find_layout_bbox_by_line(next_first_line['bbox'], new_layout_bbox) + + pre_x2_max = pre_layout[2] if pre_layout else -1 + next_x0_min = next_layout[0] if next_layout else -1 + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text and pre_last_line_text[ + -1] not in LINE_STOP_FLAG and \ + next_first_line['bbox'][0] == next_x0_min: # 
前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + #connected_layout_paras[-1][-1].extend(layout_paras[i][0]) + connected_layout_blocks[-1][-1]["lines"].extend(blocks_group[i][0]["lines"]) + #layout_paras[i].pop(0) # 删除后一个layout的第一个段落, 因为他已经被合并到前一个layout的最后一个段落了。 + blocks_group[i][0]["lines"] = [] #删除后一个layout第一个段落中的lines,因为他已经被合并到前一个layout的最后一个段落了 + blocks_group[i][0][LINES_DELETED] = True + # if len(layout_paras[i]) == 0: + # layout_paras.pop(i) + # else: + # connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + else: + """连接段落条件不成立,将前一个layout的段落加入到结果中。""" + #connected_layout_paras.append(layout_paras[i]) + connected_layout_blocks.append(blocks_group[i]) + return connected_layout_blocks + + +def __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, next_page_layout_bbox, page_num, + lang): + """ + 连接起来相邻两个页面的段落——前一个页面最后一个段落和后一个页面的第一个段落。 + 是否可以连接的条件: + 1. 前一个页面的最后一个段落最后一行沾满整个行。并且没有结尾符号。 + 2. 后一个页面的第一个段落第一行没有空白开头。 + """ + # 有的页面可能压根没有文字 + if len(pre_page_paras) == 0 or len(next_page_paras) == 0 or len(pre_page_paras[0]) == 0 or len( + next_page_paras[0]) == 0: # TODO [[]]为什么出现在pre_page_paras里? 
+ return False + pre_last_block = pre_page_paras[-1][-1] + next_first_block = next_page_paras[0][0] + if pre_last_block["type"] != BlockType.Text or next_first_block["type"] != BlockType.Text: + return False + if len(pre_last_block["lines"]) == 0 or len(next_first_block["lines"]) == 0: + return False + pre_last_para = pre_last_block["lines"] + next_first_para = next_first_block["lines"] + pre_last_line = pre_last_para[-1] + next_first_line = next_first_para[0] + pre_last_line_text = ''.join([__get_span_text(span) for span in pre_last_line['spans']]) + pre_last_line_type = pre_last_line['spans'][-1]['type'] + next_first_line_text = ''.join([__get_span_text(span) for span in next_first_line['spans']]) + next_first_line_type = next_first_line['spans'][0]['type'] + + if pre_last_line_type not in [TEXT, INLINE_EQUATION] or next_first_line_type not in [TEXT, + INLINE_EQUATION]: # TODO,真的要做好,要考虑跨table, image, 行间的情况 + # 不是文本,不连接 + return False + + pre_x2_max_bbox = __find_layout_bbox_by_line(pre_last_line['bbox'], pre_page_layout_bbox) + if not pre_x2_max_bbox: + return False + next_x0_min_bbox = __find_layout_bbox_by_line(next_first_line['bbox'], next_page_layout_bbox) + if not next_x0_min_bbox: + return False + + pre_x2_max = pre_x2_max_bbox[2] + next_x0_min = next_x0_min_bbox[0] + + pre_last_line_text = pre_last_line_text.strip() + next_first_line_text = next_first_line_text.strip() + if pre_last_line['bbox'][2] == pre_x2_max and pre_last_line_text[-1] not in LINE_STOP_FLAG and \ + next_first_line['bbox'][0] == next_x0_min: # 前面一行沾满了整个行,并且没有结尾符号.下一行没有空白开头。 + """连接段落条件成立,将前一个layout的段落和后一个layout的段落连接。""" + # 下一页合并到上一页最后一段,打一个cross_page的标签 + for line in next_first_para: + for span in line["spans"]: + span[CROSS_PAGE] = True + pre_last_para.extend(next_first_para) + + #next_page_paras[0].pop(0) # 删除后一个页面的第一个段落, 因为他已经被合并到前一个页面的最后一个段落了。 + next_page_paras[0][0]["lines"] = [] + next_page_paras[0][0][LINES_DELETED] = True + return True + else: + return False + + +def 
find_consecutive_true_regions(input_array): + start_index = None # 连续True区域的起始索引 + regions = [] # 用于保存所有连续True区域的起始和结束索引 + + for i in range(len(input_array)): + # 如果我们找到了一个True值,并且当前并没有在连续True区域中 + if input_array[i] and start_index is None: + start_index = i # 记录连续True区域的起始索引 + + # 如果我们找到了一个False值,并且当前在连续True区域中 + elif not input_array[i] and start_index is not None: + # 如果连续True区域长度大于1,那么将其添加到结果列表中 + if i - start_index > 1: + regions.append((start_index, i - 1)) + start_index = None # 重置起始索引 + + # 如果最后一个元素是True,那么需要将最后一个连续True区域加入到结果列表中 + if start_index is not None and len(input_array) - start_index > 1: + regions.append((start_index, len(input_array) - 1)) + + return regions + + +def __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang): + global debug_able + """ + 找出来中间对齐的连续单行文本,如果连续行高度相同,那么合并为一个段落。 + 一个line居中的条件是: + 1. 水平中心点跨越layout的中心点。 + 2. 左右两侧都有空白 + """ + + for layout_i, layout_para in enumerate(page_paras): + layout_box = new_layout_bbox[layout_i] + single_line_paras_tag = [] + for i in range(len(layout_para)): + #single_line_paras_tag.append(len(layout_para[i]) == 1 and layout_para[i][0]['spans'][0]['type'] == TEXT) + single_line_paras_tag.append(layout_para[i]['type'] == BlockType.Text and len(layout_para[i]["lines"]) == 1) + """找出来连续的单行文本,如果连续行高度相同,那么合并为一个段落。""" + consecutive_single_line_indices = find_consecutive_true_regions(single_line_paras_tag) + if len(consecutive_single_line_indices) > 0: + #index_offset = 0 + """检查这些行是否是高度相同的,居中的""" + for start, end in consecutive_single_line_indices: + #start += index_offset + #end += index_offset + line_hi = np.array([block["lines"][0]['bbox'][3] - block["lines"][0]['bbox'][1] for block in + layout_para[start:end + 1]]) + first_line_text = ''.join([__get_span_text(span) for span in layout_para[start]["lines"][0]['spans']]) + if "Table" in first_line_text or "Figure" in first_line_text: + pass + if debug_able: + logger.info(line_hi.std()) + + if line_hi.std() < 2: + """行高度相同,那么判断是否居中""" + 
all_left_x0 = [block["lines"][0]['bbox'][0] for block in layout_para[start:end + 1]] + all_right_x1 = [block["lines"][0]['bbox'][2] for block in layout_para[start:end + 1]] + layout_center = (layout_box[0] + layout_box[2]) / 2 + if all([x0 < layout_center < x1 for x0, x1 in zip(all_left_x0, all_right_x1)]) \ + and not all([x0 == layout_box[0] for x0 in all_left_x0]) \ + and not all([x1 == layout_box[2] for x1 in all_right_x1]): + merge_para = [block["lines"][0] for block in layout_para[start:end + 1]] + para_text = ''.join([__get_span_text(span) for line in merge_para for span in line['spans']]) + if debug_able: + logger.info(para_text) + layout_para[start]["lines"] = merge_para + for i_para in range(start + 1, end + 1): + layout_para[i_para]["lines"] = [] + layout_para[i_para][LINES_DELETED] = True + #layout_para[start:end + 1] = [merge_para] + + #index_offset -= end - start + + return + + +def __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang): + """ + 找出来连续的单行文本,如果首行顶格,接下来的几个单行段落缩进对齐,那么合并为一个段落。 + """ + + pass + + +def __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang): + """ + 根据line和layout情况进行分段 + 先实现一个根据行末尾特征分段的简单方法。 + """ + """ + 算法思路: + 1. 扫描layout里每一行,找出来行尾距离layout有边界有一定距离的行。 + 2. 从上述行中找到末尾是句号等可作为断行标志的行。 + 3. 参照上述行尾特征进行分段。 + 4. 
图、表,目前独占一行,不考虑分段。 + """ + blocks_group = __group_line_by_layout(blocks, layout_bboxes) # block内分段 + layout_list_info = __split_para_in_layoutbox(blocks_group, new_layout_bbox, lang) # layout内分段 + blocks_group, page_list_info = __connect_list_inter_layout(blocks_group, new_layout_bbox, layout_list_info, + page_num, lang) # layout之间连接列表段落 + connected_layout_blocks = __connect_para_inter_layoutbox(blocks_group, new_layout_bbox) # layout间链接段落 + + return connected_layout_blocks, page_list_info + + +def para_split(pdf_info_dict, debug_mode, lang="en"): + global debug_able + debug_able = debug_mode + new_layout_of_pages = [] # 数组的数组,每个元素是一个页面的layoutS + all_page_list_info = [] # 保存每个页面开头和结尾是否是列表 + for page_num, page in pdf_info_dict.items(): + blocks = page['preproc_blocks'] + layout_bboxes = page['layout_bboxes'] + new_layout_bbox = __common_pre_proc(blocks, layout_bboxes) + new_layout_of_pages.append(new_layout_bbox) + splited_blocks, page_list_info = __do_split_page(blocks, layout_bboxes, new_layout_bbox, page_num, lang) + all_page_list_info.append(page_list_info) + page['para_blocks'] = splited_blocks + + """连接页面与页面之间的可能合并的段落""" + pdf_infos = list(pdf_info_dict.values()) + for page_num, page in enumerate(pdf_info_dict.values()): + if page_num == 0: + continue + pre_page_paras = pdf_infos[page_num - 1]['para_blocks'] + next_page_paras = pdf_infos[page_num]['para_blocks'] + pre_page_layout_bbox = new_layout_of_pages[page_num - 1] + next_page_layout_bbox = new_layout_of_pages[page_num] + + is_conn = __connect_para_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, + next_page_layout_bbox, page_num, lang) + if debug_able: + if is_conn: + logger.info(f"连接了第{page_num - 1}页和第{page_num}页的段落") + + is_list_conn = __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_bbox, + next_page_layout_bbox, all_page_list_info[page_num - 1], + all_page_list_info[page_num], page_num, lang) + if debug_able: + if is_list_conn: + logger.info(f"连接了第{page_num - 
1}页和第{page_num}页的列表段落") + + """接下来可能会漏掉一些特别的一些可以合并的内容,对他们进行段落连接 + 1. 正文中有时出现一个行顶格,接下来几行缩进的情况。 + 2. 居中的一些连续单行,如果高度相同,那么可能是一个段落。 + """ + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + new_layout_bbox = new_layout_of_pages[page_num] + __connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang) + __merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang) + + # layout展平 + for page_num, page in enumerate(pdf_info_dict.values()): + page_paras = page['para_blocks'] + page_blocks = [block for layout in page_paras for block in layout] + page["para_blocks"] = page_blocks diff --git a/magic_pdf/para/raw_processor.py b/magic_pdf/para/raw_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..edbf9964b88159d898555711571703e74fc56180 --- /dev/null +++ b/magic_pdf/para/raw_processor.py @@ -0,0 +1,207 @@ +class RawBlockProcessor: + def __init__(self) -> None: + self.y_tolerance = 2 + self.pdf_dic = {} + + def __span_flags_decomposer(self, span_flags): + """ + Make font flags human readable. + + Parameters + ---------- + self : object + The instance of the class. + + span_flags : int + span flags + + Returns + ------- + l : dict + decomposed flags + """ + + l = { + "is_superscript": False, + "is_italic": False, + "is_serifed": False, + "is_sans_serifed": False, + "is_monospaced": False, + "is_proportional": False, + "is_bold": False, + } + + if span_flags & 2**0: + l["is_superscript"] = True # 表示上标 + + if span_flags & 2**1: + l["is_italic"] = True # 表示斜体 + + if span_flags & 2**2: + l["is_serifed"] = True # 表示衬线字体 + else: + l["is_sans_serifed"] = True # 表示非衬线字体 + + if span_flags & 2**3: + l["is_monospaced"] = True # 表示等宽字体 + else: + l["is_proportional"] = True # 表示比例字体 + + if span_flags & 2**4: + l["is_bold"] = True # 表示粗体 + + return l + + def __make_new_lines(self, raw_lines): + """ + This function makes new lines. + + Parameters + ---------- + self : object + The instance of the class. 
+ + raw_lines : list + raw lines + + Returns + ------- + new_lines : list + new lines + """ + new_lines = [] + new_line = None + + for raw_line in raw_lines: + raw_line_bbox = raw_line["bbox"] + raw_line_spans = raw_line["spans"] + raw_line_text = "".join([span["text"] for span in raw_line_spans]) + raw_line_dir = raw_line.get("dir", None) + + decomposed_line_spans = [] + for span in raw_line_spans: + raw_flags = span["flags"] + decomposed_flags = self.__span_flags_decomposer(raw_flags) + span["decomposed_flags"] = decomposed_flags + decomposed_line_spans.append(span) + + if new_line is None: + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": decomposed_line_spans, + } + else: + if ( + abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance + and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance + ): + new_line["bbox"] = ( + min(new_line["bbox"][0], raw_line_bbox[0]), # left + new_line["bbox"][1], # top + max(new_line["bbox"][2], raw_line_bbox[2]), # right + raw_line_bbox[3], # bottom + ) + new_line["text"] += " " + raw_line_text + new_line["spans"].extend(raw_line_spans) + new_line["dir"] = ( + new_line["dir"][0] + raw_line_dir[0], + new_line["dir"][1] + raw_line_dir[1], + ) + else: + new_lines.append(new_line) + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": raw_line_spans, + } + if new_line: + new_lines.append(new_line) + + return new_lines + + def __make_new_block(self, raw_block): + """ + This function makes a new block. + + Parameters + ---------- + self : object + The instance of the class. 
+ ---------- + raw_block : dict + a raw block + + Returns + ------- + new_block : dict + + Schema of new_block: + { + "block_id": "block_1", + "bbox": [0, 0, 100, 100], + "text": "This is a block.", + "lines": [ + { + "bbox": [0, 0, 100, 100], + "text": "This is a line.", + "spans": [ + { + "text": "This is a span.", + "font": "Times New Roman", + "size": 12, + "color": "#000000", + } + ], + } + ], + } + """ + new_block = {} + + block_id = raw_block["number"] + block_bbox = raw_block["bbox"] + block_text = " ".join(span["text"] for line in raw_block["lines"] for span in line["spans"]) + raw_lines = raw_block["lines"] + block_lines = self.__make_new_lines(raw_lines) + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["lines"] = block_lines + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json. 
+ + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "preproc_blocks" in blocks.keys(): + input_blocks = blocks["preproc_blocks"] + for raw_block in input_blocks: + new_block = self.__make_new_block(raw_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + diff --git a/magic_pdf/para/stats.py b/magic_pdf/para/stats.py new file mode 100644 index 0000000000000000000000000000000000000000..fd509b952b8df09d7dcaba4a561a0c9f8caced78 --- /dev/null +++ b/magic_pdf/para/stats.py @@ -0,0 +1,268 @@ +from collections import Counter +import numpy as np + +from magic_pdf.para.commons import * + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class BlockStatisticsCalculator: + def __init__(self) -> None: + pass + + def __calc_stats_of_new_lines(self, new_lines): + """ + This function calculates the paragraph metrics + + Parameters + ---------- + combined_lines : list + combined lines + + Returns + ------- + X0 : float + Median of x0 values, which represents the left average boundary of the block + X1 : float + Median of x1 values, which represents the right average boundary of the block + avg_char_width : float + Average of char widths, which represents the average char width of the block + avg_char_height : float + Average of line heights, which represents the average line height of the block + + """ + x0_values = [] + x1_values = [] + char_widths = [] + char_heights = [] + + block_font_types = [] + block_font_sizes = [] + block_directions = [] + + if len(new_lines) > 0: + for i, line in enumerate(new_lines): + line_bbox = line["bbox"] + line_text = line["text"] + line_spans = line["spans"] + + num_chars = len([ch for ch in line_text if not ch.isspace()]) + + x0_values.append(line_bbox[0]) + x1_values.append(line_bbox[2]) + + if num_chars > 0: + char_width = (line_bbox[2] 
- line_bbox[0]) / num_chars + char_widths.append(char_width) + + for span in line_spans: + block_font_types.append(span["font"]) + block_font_sizes.append(span["size"]) + + if "dir" in line: + block_directions.append(line["dir"]) + + # line_font_types = [span["font"] for span in line_spans] + char_heights = [span["size"] for span in line_spans] + + X0 = np.median(x0_values) if x0_values else 0 + X1 = np.median(x1_values) if x1_values else 0 + avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0 + avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0 + + # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None + + max_span_length = 0 + max_span_font_type = None + for line in new_lines: + line_spans = line["spans"] + for span in line_spans: + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_span_length: + max_span_length = span_length + max_span_font_type = span["font"] + + max_freq_font_type = max_span_font_type + + avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None + + avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0 + avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0 + + median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None + + return ( + X0, + X1, + avg_char_width, + avg_char_height, + max_freq_font_type, + avg_font_size, + (avg_dir_horizontal, avg_dir_vertical), + median_font_size, + ) + + def __make_new_block(self, input_block): + new_block = {} + + raw_lines = input_block["lines"] + stats = self.__calc_stats_of_new_lines(raw_lines) + + block_id = input_block["block_id"] + block_bbox = input_block["bbox"] + block_text = input_block["text"] + block_lines = raw_lines + block_avg_left_boundary = stats[0] + block_avg_right_boundary = stats[1] + 
block_avg_char_width = stats[2] + block_avg_char_height = stats[3] + block_font_type = stats[4] + block_font_size = stats[5] + block_direction = stats[6] + block_median_font_size = stats[7] + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["dir"] = block_direction + new_block["X0"] = block_avg_left_boundary + new_block["X1"] = block_avg_right_boundary + new_block["avg_char_width"] = block_avg_char_width + new_block["avg_char_height"] = block_avg_char_height + new_block["block_font_type"] = block_font_type + new_block["block_font_size"] = block_font_size + new_block["lines"] = block_lines + new_block["median_font_size"] = block_median_font_size + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json + + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + input_blocks = blocks["para_blocks"] + for input_block in input_blocks: + new_block = self.__make_new_block(input_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + + +class DocStatisticsCalculator: + def __init__(self) -> None: + pass + + def calc_stats_of_doc(self, pdf_dict): + """ + This function computes the statistics of the document + + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + statistics : dict + statistics of the document + """ + + total_text_length = 0 + total_num_blocks = 0 + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "para_blocks" in 
blocks.keys(): + para_blocks = blocks["para_blocks"] + for para_block in para_blocks: + total_text_length += len(para_block["text"]) + total_num_blocks += 1 + + avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0 + + font_list = [] + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "para_blocks" in blocks.keys(): + input_blocks = blocks["para_blocks"] + for input_block in input_blocks: + block_text_length = len(input_block.get("text", "")) + if block_text_length < avg_text_length * 0.5: + continue + block_font_type = safe_get(input_block, "block_font_type", "") + block_font_size = safe_get(input_block, "block_font_size", 0) + font_list.append((block_font_type, block_font_size)) + + font_counter = Counter(font_list) + most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0) + second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0) + + statistics = { + "num_pages": 0, + "num_blocks": 0, + "num_paras": 0, + "num_titles": 0, + "num_header_blocks": 0, + "num_footer_blocks": 0, + "num_watermark_blocks": 0, + "num_vertical_margin_note_blocks": 0, + "most_common_font_type": most_common_font[0][0], + "most_common_font_size": most_common_font[0][1], + "number_of_most_common_font": most_common_font[1], + "second_most_common_font_type": second_most_common_font[0][0], + "second_most_common_font_size": second_most_common_font[0][1], + "number_of_second_most_common_font": second_most_common_font[1], + "avg_text_length": avg_text_length, + } + + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + blocks = pdf_dict[page_id]["para_blocks"] + statistics["num_pages"] += 1 + for block_id, block_data in enumerate(blocks): + statistics["num_blocks"] += 1 + + if "paras" in block_data.keys(): + statistics["num_paras"] += len(block_data["paras"]) + + for line in block_data["lines"]: + if line.get("is_title", 0): + 
statistics["num_titles"] += 1 + + if block_data.get("is_header", 0): + statistics["num_header_blocks"] += 1 + if block_data.get("is_footer", 0): + statistics["num_footer_blocks"] += 1 + if block_data.get("is_watermark", 0): + statistics["num_watermark_blocks"] += 1 + if block_data.get("is_vertical_margin_note", 0): + statistics["num_vertical_margin_note_blocks"] += 1 + + pdf_dict["statistics"] = statistics + + return pdf_dict + + diff --git a/magic_pdf/para/title_processor.py b/magic_pdf/para/title_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..00d330fcd062b847cd5f8de0d16ea7bbb683837a --- /dev/null +++ b/magic_pdf/para/title_processor.py @@ -0,0 +1,1014 @@ +import os +import re +import numpy as np + +from magic_pdf.libs.nlp_utils import NLPModels + +from magic_pdf.para.commons import * + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +class TitleProcessor: + def __init__(self, *doc_statistics) -> None: + if len(doc_statistics) > 0: + self.doc_statistics = doc_statistics[0] + + self.nlp_model = NLPModels() + self.MAX_TITLE_LEVEL = 3 + self.numbered_title_pattern = r""" + ^ # 行首 + ( # 开始捕获组 + [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1) + |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2) + |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A) + |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A) + |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I) + |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I) + |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1 + |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、 + |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一) + |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1 + |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a) + |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a) + |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A- + |\w+:\s # 英文序号词后跟冒号和空格,例如:First: + |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格 + |[IVXLCDM]+\. # 罗马数字后跟点,例如:I. + |\d+\.\s # 单个数字后跟点和空格,例如:1. 
+ ) # 结束捕获组 + .+ # 标题的其余部分 + """ + + def _is_potential_title( + self, + curr_line, + prev_line, + prev_line_is_title, + next_line, + avg_char_width, + avg_char_height, + median_font_size, + ): + """ + This function checks if the line is a potential title. + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the line is a potential title, False otherwise. + """ + + def __is_line_centered(line_bbox, page_bbox, avg_char_width): + """ + This function checks if the line is centered on the page + + Parameters + ---------- + line_bbox : list + bbox of the line + page_bbox : list + bbox of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is centered on the page, False otherwise. + """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, x1, _ = line_bbox + _, _, page_x1, _ = page_bbox + + return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres + + def __is_bold_font_line(line): + """ + Check if a line contains any bold font style. + """ + + def _is_bold_span(span): + # if span text is empty or only contains space, return False + if not span["text"].strip(): + return False + + return bool(span["flags"] & 2**4) # Check if the font is bold + + for span in line["spans"]: + if not _is_bold_span(span): + return False + + return True + + def __is_italic_font_line(line): + """ + Check if a line contains any italic font style. 
+ """ + + def __is_italic_span(span): + return bool(span["flags"] & 2**1) # Check if the font is italic + + for span in line["spans"]: + if not __is_italic_span(span): + return False + + return True + + def __is_punctuation_heavy(line_text): + """ + Check if the line contains a high ratio of punctuation marks, which may indicate + that the line is not a title. + + Parameters: + line_text (str): Text of the line. + + Returns: + bool: True if the line is heavy with punctuation, False otherwise. + """ + # Pattern for common title format like "X.Y. Title" + pattern = r"\b\d+\.\d+\..*\b" + + # If the line matches the title format, return False + if re.match(pattern, line_text.strip()): + return False + + # Find all punctuation marks in the line + punctuation_marks = re.findall(r"[^\w\s]", line_text) + number_of_punctuation_marks = len(punctuation_marks) + + text_length = len(line_text) + + if text_length == 0: + return False + + punctuation_ratio = number_of_punctuation_marks / text_length + if punctuation_ratio >= 0.1: + return True + + return False + + def __has_mixed_font_styles(spans, strict_mode=False): + """ + This function checks if the line has mixed font styles, the strict mode will compare the font types + + Parameters + ---------- + spans : list + spans of the line + strict_mode : bool + True for strict mode, the font types will be fully compared + False for non-strict mode, the font types will be compared by the most longest common prefix + + Returns + ------- + bool + True if the line has mixed font styles, False otherwise. 
+ """ + if strict_mode: + font_styles = set() + for span in spans: + font_style = span["font"].lower() + font_styles.add(font_style) + + return len(font_styles) > 1 + + else: # non-strict mode + font_styles = [] + for span in spans: + font_style = span["font"].lower() + font_styles.append(font_style) + + if len(font_styles) > 1: + longest_common_prefix = os.path.commonprefix(font_styles) + if len(longest_common_prefix) > 0: + return False + else: + return True + else: + return False + + def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type): + """ + This function checks if the current line has a different font type from the previous and next lines + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + prev_line_font_type : str + font type of the previous line + next_line_font_type : str + font type of the next line + + Returns + ------- + bool + True if the current line has a different font type from the previous and next lines, False otherwise. + """ + return all( + curr_line_font_type != other_font_type.lower() + for other_font_type in [prev_line_font_type, next_line_font_type] + if other_font_type is not None + ) + + def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size): + """ + This function checks if the current line has a larger font size than the previous and next lines + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + prev_line_font_size : float + font size of the previous line + next_line_font_size : float + font size of the next line + + Returns + ------- + bool + True if the current line has a larger font size than the previous and next lines, False otherwise. 
+ """ + return all( + curr_line_font_size > other_font_size * 1.2 + for other_font_size in [prev_line_font_size, next_line_font_size] + if other_font_size is not None + ) + + def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size): + """ + This function checks if the current line is similar to the previous line + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + + Returns + ------- + bool + True if the current line is similar to the previous line, False otherwise. + """ + + if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size: + return True + else: + return False + + def __is_same_font_type_of_docAvg(curr_line_font_type): + """ + This function checks if the current line has the same font type as the document average font type + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + + Returns + ------- + bool + True if the current line has the same font type as the document average font type, False otherwise. + """ + doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower() + doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower() + + return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type] + + def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1): + """ + This function checks if the current line has a large enough font size + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + ratio : float + ratio of the current line font size to the document average font size + + Returns + ------- + bool + True if the current line has a large enough font size, False otherwise. 
+ """ + doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0) + doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0) + doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size) + + return curr_line_font_size >= doc_avg_font_size * ratio + + def __is_sufficient_spacing_above_and_below( + curr_line_bbox, + prev_line_bbox, + next_line_bbox, + avg_char_height, + median_font_size, + ): + """ + This function checks if the current line has sufficient spacing above and below + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the current line has sufficient spacing above and below, False otherwise. + """ + vertical_ratio = 1.25 + vertical_thres = vertical_ratio * median_font_size + + _, y0, _, y1 = curr_line_bbox + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3]) + sufficient_spacing_above = vertical_spacing_above > vertical_thres + else: + sufficient_spacing_above = True + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1) + sufficient_spacing_below = vertical_spacing_below > vertical_thres + else: + sufficient_spacing_below = True + + return (sufficient_spacing_above, sufficient_spacing_below) + + def __is_word_list_line_by_rules(curr_line_text): + """ + This function checks if the current line is a word list + + Parameters + ---------- + curr_line_text : str + text of the current line + + Returns + ------- + bool + True if the current line is a name list, False otherwise. 
+ """ + # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)" + name_list_pattern = r"(?= 0.9: + return True + + return False + + def __is_equation(line_text): + """ + This function checks if the current line is an equation. + + Parameters + ---------- + line_text : str + + Returns + ------- + bool + True if the current line is an equation, False otherwise. + """ + equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations + + if re.search(equation_reg, line_text): + return True + else: + return False + + def __is_title_by_len(text, max_length=200): + """ + This function checks if the current line is a title by length. + + Parameters + ---------- + text : str + text of the current line + + max_length : int + max length of the title + + Returns + ------- + bool + True if the current line is a title, False otherwise. + + """ + text = text.strip() + return len(text) <= max_length + + def __compute_line_font_type_and_size(curr_line): + """ + This function computes the font type and font size of the line. + + Parameters + ---------- + line : dict + line + + Returns + ------- + font_type : str + font type of the line + font_size : float + font size of the line + """ + spans = curr_line["spans"] + max_accumulated_length = 0 + max_span_font_size = curr_line["spans"][0]["size"] # default value, float type + max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type + for span in spans: + if span["text"].isspace(): + continue + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_accumulated_length: + max_accumulated_length = span_length + max_span_font_size = span["size"] + max_span_font_type = span["font"].lower() + + return max_span_font_type, max_span_font_size + + """ + Title detecting main Process. + """ + + """ + Basic features about the current line. 
+ """ + curr_line_bbox = curr_line["bbox"] + curr_line_text = curr_line["text"] + curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line) + + if len(curr_line_text.strip()) == 0: # skip empty lines + return False + + prev_line_bbox = prev_line["bbox"] if prev_line else None + if prev_line: + prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line) + else: + prev_line_font_type, prev_line_font_size = None, None + + next_line_bbox = next_line["bbox"] if next_line else None + if next_line: + next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line) + else: + next_line_font_type, next_line_font_size = None, None + + """ + Aggregated features about the current line. + """ + is_italc_font = __is_italic_font_line(curr_line) + is_bold_font = __is_bold_font_line(curr_line) + + is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8) + is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1) + is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6) + + is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type) + + is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg + + is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True) + is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False) + + is_punctuation_heavy = __is_punctuation_heavy(curr_line_text) + + is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text) + is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"] + + is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors( + curr_line_font_size, prev_line_font_size, next_line_font_size + ) + + 
is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors( + curr_line_font_type, prev_line_font_type, next_line_font_type + ) + + has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size + ) + + is_similar_to_pre_line = __is_similar_to_pre_line( + curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size + ) + + """ + Further aggregated features about the current line. + + Attention: + Features that start with __ are for internal use. + """ + + __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width + ) + __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors + is_a_left_inline_title = ( + is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors + ) + + is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font + is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font + + is_title_by_check_pre_and_next_line = ( + (prev_line is not None or next_line is not None) + and has_sufficient_spaces_above + and has_sufficient_spaces_below + and is_potential_title_font + ) + + is_numbered_title = __is_numbered_title(curr_line_text) and ( + (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None) + ) + + is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text) + + is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text) + + is_equation = __is_equation(curr_line_text) + + is_title_by_len = __is_title_by_len(curr_line_text) + + """ + Decide if the line is a title. 
+ """ + # is_title = False + # if prev_line_is_title: + + is_title = ( + is_not_end_with_ending_puncs # not end with ending punctuation marks + and is_not_only_no_meaning_symbols # not only have no meaning symbols + and is_title_by_len # is a title by length, default max length is 200 + and not is_equation # an interline equation should never be a title + and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type + and ( + (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + or ( + is_much_larger_font_than_doc_avg + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_font_size_little_less_than_doc_avg + and is_bold_font + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + ) # not the same font type as the document average font type, which includes the most common font type and the second most common font type + and ( + ( + not is_person_or_org_list_line_by_nlp + and ( + is_much_larger_font_than_doc_avg + or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + ) + ) + or ( + not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp) + and not is_a_left_inline_title + and not is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_person_or_org_list_line_by_nlp + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + ) + or (is_numbered_title and not is_a_left_inline_title) + ) + ) + # ) or (is_similar_to_pre_line and prev_line_is_title) + + 
is_name_or_org_list_to_be_removed = ( + (is_person_or_org_list_line_by_nlp) + and is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) and not is_title + + if is_name_or_org_list_to_be_removed: + is_author_or_org_list = True + # print curr_line_text to check + # print_yellow(f"Text of is_author_or_org_list: {curr_line_text}") + else: + is_author_or_org_list = False + """ + # print reason why the line is a title + if is_title: + print_green("This line is a title.") + print_green("↓" * 10) + print() + print("curr_line_text: ", curr_line_text) + print() + + # print reason why the line is not a title + line_text = curr_line_text.strip() + test_text = "Career/Personal Life" + text_content_condition = line_text == test_text + + if not is_title and text_content_condition: # Print specific line + # if not is_title: # Print each line + print_red("This line is not a title.") + print_red("↓" * 10) + + print() + print("curr_line_text: ", curr_line_text) + print() + + if is_not_end_with_ending_puncs: + print_green(f"is_not_end_with_ending_puncs") + else: + print_red(f"is_end_with_ending_puncs") + + if is_not_only_no_meaning_symbols: + print_green(f"is_not_only_no_meaning_symbols") + else: + print_red(f"is_only_no_meaning_symbols") + + if is_title_by_len: + print_green(f"is_title_by_len: {is_title_by_len}") + else: + print_red(f"is_not_title_by_len: {is_title_by_len}") + + if is_equation: + print_red(f"is_equation") + else: + print_green(f"is_not_equation") + + if is_potential_title_font: + print_green(f"is_potential_title_font") + else: + print_red(f"is_not_potential_title_font") + + if is_punctuation_heavy: + print_red("is_punctuation_heavy") + else: + print_green("is_not_punctuation_heavy") + + if is_bold_font: + print_green(f"is_bold_font") + else: + print_red(f"is_not_bold_font") + + if is_font_size_not_less_than_doc_avg: + print_green(f"is_larger_font_than_doc_avg") + else: + 
print_red(f"is_not_larger_font_than_doc_avg") + + if is_much_larger_font_than_doc_avg: + print_green(f"is_much_larger_font_than_doc_avg") + else: + print_red(f"is_not_much_larger_font_than_doc_avg") + + if is_not_same_font_type_of_docAvg: + print_green(f"is_not_same_font_type_of_docAvg") + else: + print_red(f"is_same_font_type_of_docAvg") + + if is_word_list_line_by_rules: + print_red("is_word_list_line_by_rules") + else: + print_green("is_not_name_list_by_rules") + + if is_person_or_org_list_line_by_nlp: + print_red("is_person_or_org_list_line_by_nlp") + else: + print_green("is_not_person_or_org_list_line_by_nlp") + + if not is_numbered_title: + print_red("is_not_numbered_title") + else: + print_green("is_numbered_title") + + if is_a_left_inline_title: + print_red("is_a_left_inline_title") + else: + print_green("is_not_a_left_inline_title") + + if not is_title_by_check_prev_line: + print_red("is_not_title_by_check_prev_line") + else: + print_green("is_title_by_check_prev_line") + + if not is_title_by_check_next_line: + print_red("is_not_title_by_check_next_line") + else: + print_green("is_title_by_check_next_line") + + if not is_title_by_check_pre_and_next_line: + print_red("is_not_title_by_check_pre_and_next_line") + else: + print_green("is_title_by_check_pre_and_next_line") + + # print_green("Common features:") + # print_green("↓" * 10) + + # print(f" curr_line_font_type: {curr_line_font_type}") + # print(f" curr_line_font_size: {curr_line_font_size}") + # print() + + """ + + return is_title, is_author_or_org_list + + def _detect_block_title(self, input_block): + """ + Use the functions 'is_potential_title' to detect titles of each paragraph block. + If a line is a title, then the value of key 'is_title' of the line will be set to True. 
+ """ + + raw_lines = input_block["lines"] + + prev_line_is_title_flag = False + + for i, curr_line in enumerate(raw_lines): + prev_line = raw_lines[i - 1] if i > 0 else None + next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None + + blk_avg_char_width = input_block["avg_char_width"] + blk_avg_char_height = input_block["avg_char_height"] + blk_media_font_size = input_block["median_font_size"] + + is_title, is_author_or_org_list = self._is_potential_title( + curr_line, + prev_line, + prev_line_is_title_flag, + next_line, + blk_avg_char_width, + blk_avg_char_height, + blk_media_font_size, + ) + + if is_title: + curr_line["is_title"] = is_title + prev_line_is_title_flag = True + else: + curr_line["is_title"] = False + prev_line_is_title_flag = False + + if is_author_or_org_list: + curr_line["is_author_or_org_list"] = is_author_or_org_list + else: + curr_line["is_author_or_org_list"] = False + + return input_block + + def batch_process_blocks_detect_titles(self, pdf_dic): + """ + This function batch process the blocks to detect titles. + + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + num_titles = 0 + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + para_blocks = blocks["para_blocks"] + + all_single_line_blocks = [] + for block in para_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + new_para_blocks = [] + if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks. + for para_block in para_blocks: + new_block = self._detect_block_title(para_block) + new_para_blocks.append(new_block) + num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]]) + else: # All blocks are single line blocks. 
+ for para_block in para_blocks: + new_para_blocks.append(para_block) + num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]]) + para_blocks = new_para_blocks + + blocks["para_blocks"] = para_blocks + + for para_block in para_blocks: + all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"]) + para_text_len = sum([len(line["text"]) for line in para_block["lines"]]) + if ( + all_titles and para_text_len < 200 + ): # total length of the paragraph is less than 200, more than this should not be a title + para_block["is_block_title"] = 1 + else: + para_block["is_block_title"] = 0 + + all_name_or_org_list_to_be_removed = all( + safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"] + ) + if all_name_or_org_list_to_be_removed and page_id == "page_0": + para_block["is_block_an_author_or_org_list"] = 1 + else: + para_block["is_block_an_author_or_org_list"] = 0 + + pdf_dic["statistics"]["num_titles"] = num_titles + + return pdf_dic + + def __determine_size_based_level(self, title_blocks): + """ + This function determines the title level based on the font size of the title. 
+ + Parameters + ---------- + title_blocks : list + + Returns + ------- + title_blocks : list + """ + + font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks]) + + # Use the mean and std of font sizes to remove extreme values + mean_font_size = np.mean(font_sizes) + std_font_size = np.std(font_sizes) + min_extreme_font_size = mean_font_size - std_font_size # type: ignore + max_extreme_font_size = mean_font_size + std_font_size # type: ignore + + # Compute the threshold for title level + middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)] + if middle_font_sizes.size > 0: + middle_mean_font_size = np.mean(middle_font_sizes) + level_threshold = middle_mean_font_size + else: + level_threshold = mean_font_size + + for tb in title_blocks: + title_block = tb["block"] + title_font_size = safe_get(title_block, "block_font_size", 0) + + current_level = 1 # Initialize title level, the biggest level is 1 + + # print(f"Before adjustment by font size, {current_level}") + if title_font_size >= max_extreme_font_size: + current_level = 1 + elif title_font_size <= min_extreme_font_size: + current_level = 3 + elif float(title_font_size) >= float(level_threshold): + current_level = 2 + else: + current_level = 3 + # print(f"After adjustment by font size, {current_level}") + + title_block["block_title_level"] = current_level + + return title_blocks + + def batch_process_blocks_recog_title_level(self, pdf_dic): + title_blocks = [] + + # Collect all titles + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = blocks.get("para_blocks", []) + for block in para_blocks: + if block.get("is_block_title"): + title_obj = {"page_id": page_id, "block": block} + title_blocks.append(title_obj) + + # Determine title level + if title_blocks: + # Determine title level based on font size + title_blocks = self.__determine_size_based_level(title_blocks) + + return pdf_dic diff 
def parse_pdf_by_ocr(pdf_bytes,
                     model_list,
                     imageWriter,
                     start_page_id=0,
                     end_page_id=None,
                     debug_mode=False,
                     ):
    """Parse a PDF with the OCR pipeline.

    Thin wrapper that delegates to ``pdf_parse_union`` with mode "ocr";
    all other arguments are forwarded unchanged.
    """
    return pdf_parse_union(
        pdf_bytes,
        model_list,
        imageWriter,
        "ocr",
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )


def parse_pdf_by_txt(
    pdf_bytes,
    model_list,
    imageWriter,
    start_page_id=0,
    end_page_id=None,
    debug_mode=False,
):
    """Parse a PDF with the text-extraction pipeline.

    Thin wrapper that delegates to ``pdf_parse_union`` with mode "txt";
    all other arguments are forwarded unchanged.
    """
    return pdf_parse_union(
        pdf_bytes,
        model_list,
        imageWriter,
        "txt",
        start_page_id=start_page_id,
        end_page_id=end_page_id,
        debug_mode=debug_mode,
    )
magic_pdf.libs.markdown_utils import escape_special_markdown_char +from magic_pdf.libs.safe_filename import sanitize_filename +from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page +from magic_pdf.pre_proc.cut_image import txt_save_images_by_bboxes +from magic_pdf.pre_proc.detect_images import parse_images +from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox +from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox +from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox +from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox +from magic_pdf.pre_proc.detect_footnote import ( + parse_footnotes_by_model, + parse_footnotes_by_rule, +) # 获取footnotes的bbox +from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox + +from magic_pdf.post_proc.detect_para import ( + ParaProcessPipeline, + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +from magic_pdf.pre_proc.main_text_font import get_main_text_font +from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock +from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page +from magic_pdf.train_utils.extract_caption import extract_caption_bbox + +""" +from para.para_pipeline import ParaProcessPipeline +from para.exceptions import ( + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +""" + +from magic_pdf.libs.commons import read_file, join_path +from magic_pdf.post_proc.remove_footnote import ( + merge_footnote_blocks, + remove_footnote_blocks, +) +from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker +from magic_pdf.pre_proc.equations_replace import ( + combine_chars_to_pymudict, + remove_chars_in_text_blocks, + replace_equations_in_textblock, +) +from 
magic_pdf.pre_proc.pdf_pre_filter import pdf_filter +from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header +from magic_pdf.pre_proc.construct_page_dict import construct_page_component +from magic_pdf.pre_proc.fix_image import ( + combine_images, + fix_image_vertical, + fix_seperated_image, + include_img_title, +) +from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter +from magic_pdf.pre_proc.remove_rotate_bbox import ( + get_side_boundry, + remove_rotate_side_textblock, + remove_side_blank_block, +) +from magic_pdf.pre_proc.resolve_bbox_conflict import ( + check_text_block_horizontal_overlap, + resolve_bbox_overlap_conflict, +) +from magic_pdf.pre_proc.fix_table import ( + fix_table_text_block, + fix_tables, + include_table_title, +) +from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval + +denseSingleLineBlockException_msg = DenseSingleLineBlockException().message +titleDetectionException_msg = TitleDetectionException().message +titleLevelException_msg = TitleLevelException().message +paraSplitException_msg = ParaSplitException().message +paraMergeException_msg = ParaMergeException().message + + +def parse_pdf_for_train( + s3_pdf_path, + s3_pdf_profile, + pdf_model_output, + save_path, + book_name, + image_s3_config=None, + start_page_id=0, + end_page_id=None, + junk_img_bojids=[], + debug_mode=False, +): + pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) + save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") + md_bookname_save_path = "" + book_name = sanitize_filename(book_name) + if debug_mode: + save_path = join_path(save_tmp_path, "md") + pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name) + + if not os.path.exists(os.path.dirname(pdf_local_path)): + # 如果目录不存在,创建它 + os.makedirs(os.path.dirname(pdf_local_path)) + + md_bookname_save_path = join_path(save_tmp_path, "md", book_name) + if not os.path.exists(md_bookname_save_path): + # 如果目录不存在,创建它 + 
os.makedirs(md_bookname_save_path) + + with open(pdf_local_path + ".pdf", "wb") as pdf_file: + pdf_file.write(pdf_bytes) + + pdf_docs = fitz.open("pdf", pdf_bytes) + pdf_info_dict = {} + img_s3_client = get_img_s3_client( + save_path, image_s3_config + ) # 更改函数名和参数,避免歧义 + # img_s3_client = "img_s3_client" #不创建这个对象,直接用字符串占位 + + start_time = time.time() + + """通过统计pdf全篇文字,识别正文字体""" + main_text_font = get_main_text_font(pdf_docs) + + end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 + for page_id in range(start_page_id, end_page_id + 1): + page = pdf_docs[page_id] + page_width = page.rect.width + page_height = page.rect.height + + if debug_mode: + time_now = time.time() + logger.info( + f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" + ) + start_time = time_now + """ + # 通过一个规则,过滤掉单页超过1500非junkimg的pdf + # 对单页面非重复id的img数量做统计,如果当前页超过1500则直接return need_drop + """ + page_imgs = page.get_images() + img_counts = 0 + for img in page_imgs: + img_bojid = img[0] + if img_bojid in junk_img_bojids: # 判断这个图片在不在junklist中 + continue # 如果在junklist就不用管了,跳过 + else: + recs = page.get_image_rects(img, transform=True) + if recs: # 如果这张图在当前页面有展示 + img_counts += 1 + if ( + img_counts >= 1500 + ): # 如果去除了junkimg的影响,单页img仍然超过1500的话,就排除当前pdf + logger.warning( + f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: {DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 首先获取基本的block数据,对pdf进行分解,获取图片、表格、公式、text的bbox + """ + # 解析pdf原始文本block + text_raw_blocks = page.get_text( + "dict", + flags=fitz.TEXTFLAGS_TEXT, + )["blocks"] + model_output_json = get_docx_model_output( + pdf_model_output, page_id + ) + + # 解析图片 + image_bboxes = parse_images(page_id, 
page, model_output_json, junk_img_bojids) + image_bboxes = fix_image_vertical( + image_bboxes, text_raw_blocks + ) # 修正图片的位置 + image_bboxes = fix_seperated_image(image_bboxes) # 合并有边重合的图片 + + old_image_bboxes = deepcopy(image_bboxes) + image_bboxes = include_img_title( + text_raw_blocks, image_bboxes + ) # 向图片上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + """此时image_bboxes中可能出现这种情况,水平并列的2个图片,下方分别有各自的子标题,2个子标题下方又有大标题(形如Figxxx),会出现2个图片的bbox都包含了这个大标题,这种情况需要把图片合并""" + image_bboxes = combine_images(image_bboxes) # 合并图片 + + # 解析表格并对table_bboxes进行位置的微调,防止表格周围的文字被截断 + table_bboxes = parse_tables(page_id, page, model_output_json) + table_bboxes = fix_tables( + page, table_bboxes, include_table_title=False, scan_line_num=2 + ) # 修正 + table_bboxes = fix_table_text_block( + text_raw_blocks, table_bboxes + ) # 修正与text block的关系,某些table修正与pymupdf获取到的table内textblock没有完全包含,因此要进行一次修正。 + # debug_show_bbox(pdf_docs, page_id, table_bboxes, [], [b['bbox'] for b in text_raw_blocks], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + old_table_bboxes = deepcopy(table_bboxes) + table_bboxes = include_table_title( + text_raw_blocks, table_bboxes + ) # 向table上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + + # 解析公式 + equations_inline_bboxes, equations_interline_bboxes = parse_equations( + page_id, page, model_output_json + ) + + # get image box and caption ! + image_bboxes_with_caption = extract_caption_bbox(image_bboxes, old_image_bboxes) + + # get table box and caption ! 
+ table_bboxes_with_caption = extract_caption_bbox(table_bboxes, old_table_bboxes) + + """ + ================================================================================================================================== + 进入预处理-1阶段 + ------------------- + # # 解析标题 + # title_bboxs = parse_titles(page_id, page, model_output_json) + # # 评估Layout是否规整、简单 + # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json) + 接下来开始进行预处理过程 + """ + # title_bboxs = parse_titles(page_id, page, model_output_json) + + """去掉每页的页码、页眉、页脚""" + page_no_bboxs = parse_pageNos(page_id, page, model_output_json) + header_bboxs = parse_headers(page_id, page, model_output_json) + footer_bboxs = parse_footers(page_id, page, model_output_json) + ( + image_bboxes, + table_bboxes, + remain_text_blocks, + removed_hdr_foot_txt_block, + removed_hdr_foot_img_block, + removed_hdr_foot_table, + ) = remove_headder_footer_one_page( + text_raw_blocks, + image_bboxes, + table_bboxes, + header_bboxs, + footer_bboxs, + page_no_bboxs, + page_width, + page_height, + ) + + """去除页面上半部分长条色块内的文本块""" + remain_text_blocks, removed_colored_narrow_strip_background_text_block = ( + remove_colored_strip_textblock(remain_text_blocks, page) + ) + + # debug_show_bbox(pdf_docs, page_id, footnote_bboxes_by_model, [b['bbox'] for b in remain_text_blocks], header_bboxs, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + """去掉旋转的文字:水印、垂直排列的文字""" + remain_text_blocks, removed_non_horz_text_block = remove_rotate_side_textblock( + remain_text_blocks, page_width, page_height + ) # 去掉水印,非水平文字 + remain_text_blocks, removed_empty_side_block = remove_side_blank_block( + remain_text_blocks, page_width, page_height + ) # 删除页面四周可能会留下的完全空白的textblock,这种block形成原因未知 + + """出现在图片、表格上的文字块去掉,把层叠的图片单独分离出来,不参与layout的计算""" + ( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + text_block_on_image_removed, + 
images_overlap_backup, + interline_eq_temp_text_block, + ) = resolve_bbox_overlap_conflict( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + ) + + # """去掉footnote, 从文字和图片中""" + # # 通过模型识别到的footnote + # footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path, + # debug_mode=debug_mode) + # # 通过规则识别到的footnote + # footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id) + """ + ================================================================================================================================== + """ + if debug_mode: # debugmode截图到本地 + save_path = join_path(save_tmp_path, "md") + + # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容 + image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = ( + txt_save_images_by_bboxes( + book_name, + page_id, + page, + save_path, + image_bboxes, + images_overlap_backup, + table_bboxes, + equations_inline_bboxes, + equations_interline_bboxes, + # 传入img_s3_client + img_s3_client, + ) + ) # 只要表格和图片的截图 + + """"以下进入到公式替换环节 """ + char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ + "blocks" + ] + remain_text_blocks = combine_chars_to_pymudict( + remain_text_blocks, char_level_text_blocks + ) # 合并chars + remain_text_blocks = replace_equations_in_textblock( + remain_text_blocks, inline_eq_info, interline_eq_info + ) + remain_text_blocks = remove_citation_marker( + remain_text_blocks + ) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。 + remain_text_blocks = remove_chars_in_text_blocks( + remain_text_blocks + ) # 减少中间态数据体积 + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3) + + """去掉footnote, 从文字和图片中(先去角标再去footnote试试)""" + # 通过模型识别到的footnote + footnote_bboxes_by_model = parse_footnotes_by_model( + page_id, + page, + 
model_output_json, + md_bookname_save_path, + debug_mode=debug_mode, + ) + # 通过规则识别到的footnote + footnote_bboxes_by_rule = parse_footnotes_by_rule( + remain_text_blocks, page_height, page_id, main_text_font + ) + """进入pdf过滤器,去掉一些不合理的pdf""" + is_good_pdf, err = pdf_filter( + page, remain_text_blocks, table_bboxes, image_bboxes + ) + if not is_good_pdf: + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {err}" + ) + if not debug_mode: + return err + + """ + ================================================================================================================================== + 进行版面布局切分和过滤 + """ + """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ + + is_text_block_horz_overlap = check_text_block_horizontal_overlap( + remain_text_blocks, header_bboxs, footer_bboxs + ) + + if is_text_block_horz_overlap: + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0) + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP, + } + if not debug_mode: + return result + + """统一格式化成一个数据结构用于计算layout""" + page_y0 = 0 if len(header_bboxs) == 0 else max([b[3] for b in header_bboxs]) + page_y1 = ( + page_height if len(footer_bboxs) == 0 else min([b[1] for b in footer_bboxs]) + ) + left_x, right_x = get_side_boundry( + removed_non_horz_text_block, page_width, page_height + ) + page_boundry = [ + math.floor(left_x), + page_y0 + 1, + math.ceil(right_x), + page_y1 - 1, + ] + # 返回的是一个数组,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 
对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是段落的内容 + + all_bboxes = prepare_bboxes_for_layout_split( + image_info, + image_backup_info, + table_info, + inline_eq_info, + interline_eq_info, + remain_text_blocks, + page_boundry, + page, + ) + # debug_show_bbox(pdf_docs, page_id, [], [], all_bboxes, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 1) + """page_y0, page_y1能够过滤掉页眉和页脚,不会算作layout内""" + layout_bboxes, layout_tree = get_bboxes_layout( + all_bboxes, page_boundry, page_id + ) + + if ( + len(remain_text_blocks) > 0 + and len(all_bboxes) > 0 + and len(layout_bboxes) == 0 + ): + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT, + } + if not debug_mode: + return result + + """以下去掉复杂的布局和超过2列的布局""" + if any( + [lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes] + ): # 复杂的布局 + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.COMPLICATED_LAYOUT} + if not debug_mode: + return result + + layout_column_width = get_columns_cnt_of_layout(layout_tree) + if layout_column_width > 2: # 去掉超过2列的布局pdf + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS, + "extra_info": {"column_cnt": layout_column_width}, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 构造出下游需要的数据结构 + """ + remain_text_blocks = ( + remain_text_blocks + interline_eq_temp_text_block + ) # 把计算layout时候临时删除的行间公式再放回去,防止行间公式替换的时候丢失。 + removed_text_blocks = [] + removed_text_blocks.extend(removed_hdr_foot_txt_block) + # 
removed_text_blocks.extend(removed_footnote_text_block) + removed_text_blocks.extend(text_block_on_image_removed) + removed_text_blocks.extend(removed_non_horz_text_block) + removed_text_blocks.extend(removed_colored_narrow_strip_background_text_block) + + removed_images = [] + # removed_images.extend(footnote_imgs) + removed_images.extend(removed_hdr_foot_img_block) + + images_backup = [] + images_backup.extend(image_backup_info) + remain_text_blocks = escape_special_markdown_char( + remain_text_blocks + ) # 转义span里的text + sorted_text_remain_text_block = sort_text_block( + remain_text_blocks, layout_bboxes + ) + + footnote_bboxes_tmp = [] + footnote_bboxes_tmp.extend(footnote_bboxes_by_model) + footnote_bboxes_tmp.extend(footnote_bboxes_by_rule) + + page_info = construct_page_component( + page_id, + image_info, + table_info, + sorted_text_remain_text_block, + layout_bboxes, + inline_eq_info, + interline_eq_info, + page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"], + removed_text_blocks=removed_text_blocks, + removed_image_blocks=removed_images, + images_backup=images_backup, + droped_table_block=[], + table_backup=[], + layout_tree=layout_tree, + page_w=page.rect.width, + page_h=page.rect.height, + footnote_bboxes_tmp=footnote_bboxes_tmp, + ) + + page_info["image_bboxes_with_caption"] = image_bboxes_with_caption # add by xr + page_info["table_bboxes_with_caption"] = table_bboxes_with_caption + + page_info["bak_page_no_bboxes"] = page_no_bboxs + page_info["bak_header_bboxes"] = header_bboxs + page_info["bak_footer_bboxes"] = footer_bboxs + page_info["bak_footer_note_bboxes"] = footnote_bboxes_tmp + + pdf_info_dict[f"page_{page_id}"] = page_info + + # end page for + + """计算后处理阶段耗时""" + start_time = time.time() + + """ + ================================================================================================================================== + 去掉页眉和页脚,这里需要用到一定的统计量,所以放到最后 + 页眉和页脚主要从文本box和图片box中去除,位于页面的四周。 + 
下面函数会直接修改pdf_info_dict,从文字块中、图片中删除属于页眉页脚的内容,删除内容做相对应记录 + """ + # 去页眉页脚 + header, footer = drop_footer_header( + pdf_info_dict + ) # TODO: using header and footer boxes here ! + + """对单个layout内footnote和他下面的所有textbbox合并""" + + for page_key, page_info in pdf_info_dict.items(): + page_info = merge_footnote_blocks(page_info, main_text_font) + page_info = remove_footnote_blocks(page_info) + pdf_info_dict[page_key] = page_info + + """进入pdf后置过滤器,去掉一些不合理的pdf""" + + i = 0 + for page_info in pdf_info_dict.values(): + is_good_pdf, err = pdf_post_filter(page_info) + if not is_good_pdf: + logger.warning(f"page_id: {i}, drop this pdf: {book_name}, reason: {err}") + if not debug_mode: + return err + i += 1 + + if debug_mode: + params_file_save_path = join_path( + save_tmp_path, "md", book_name, "preproc_out.json" + ) + page_draw_rect_save_path = join_path( + save_tmp_path, "md", book_name, "layout.pdf" + ) + # dir_path = os.path.dirname(page_draw_rect_save_path) + # if not os.path.exists(dir_path): + # # 如果目录不存在,创建它 + # os.makedirs(dir_path) + + with open(params_file_save_path, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除 + if os.path.exists(page_draw_rect_save_path): + os.remove(page_draw_rect_save_path) + # 绘制bbox和layout到pdf + draw_bbox_on_page(pdf_docs, pdf_info_dict, page_draw_rect_save_path) + draw_layout_bbox_on_page( + pdf_docs, pdf_info_dict, header, footer, page_draw_rect_save_path + ) + + if debug_mode: + # 打印后处理阶段耗时 + logger.info(f"post_processing_time: {get_delta_time(start_time)}") + + """ + ================================================================================================================================== + 进入段落处理-2阶段 + """ + + # 处理行内文字间距较大问题 + pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict) + + start_time = time.time() + + para_process_pipeline = ParaProcessPipeline() + + def _deal_with_text_exception(error_info): + logger.warning( + f"page_id: 
{page_id}, drop this pdf: {book_name}, reason: {error_info}" + ) + if error_info == denseSingleLineBlockException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK, + } + return result + if error_info == titleDetectionException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}" + ) + result = { + "_need_drop": True, + "_drop_reason": DropReason.TITLE_DETECTION_FAILED, + } + return result + elif error_info == titleLevelException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.TITLE_LEVEL_FAILED} + return result + elif error_info == paraSplitException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.PARA_SPLIT_FAILED} + return result + elif error_info == paraMergeException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}" + ) + result = {"_need_drop": True, "_drop_reason": DropReason.PARA_MERGE_FAILED} + return result + + if debug_mode: + input_pdf_file = f"{pdf_local_path}.pdf" + output_dir = f"{save_path}/{book_name}" + output_pdf_file = f"{output_dir}/pdf_annos.pdf" + + """ + Call the para_process_pipeline function to process the pdf_info_dict. + + Parameters: + para_debug_mode: str or None + If para_debug_mode is None, the para_process_pipeline will not keep any intermediate results. + If para_debug_mode is "simple", the para_process_pipeline will only keep the annos on the pdf and the final results as a json file. + If para_debug_mode is "full", the para_process_pipeline will keep all the intermediate results generated during each step. 
+ """ + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict, + para_debug_mode="simple", + input_pdf_path=input_pdf_file, + output_pdf_path=output_pdf_file, + ) + # 打印段落处理阶段耗时 + logger.info(f"para_process_time: {get_delta_time(start_time)}") + + # debug的时候不return drop信息 + if error_info is not None: + _deal_with_text_exception(error_info) + return pdf_info_dict + else: + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict + ) + if error_info is not None: + return _deal_with_text_exception(error_info) + + return pdf_info_dict diff --git a/magic_pdf/pdf_parse_union_core.py b/magic_pdf/pdf_parse_union_core.py new file mode 100644 index 0000000000000000000000000000000000000000..60de2aa5360f0a72ebb58c92bc13f4913cf52f21 --- /dev/null +++ b/magic_pdf/pdf_parse_union_core.py @@ -0,0 +1,241 @@ +import time + +from loguru import logger + +from magic_pdf.libs.commons import fitz, get_delta_time +from magic_pdf.layout.layout_sort import get_bboxes_layout, LAYOUT_UNPROC, get_columns_cnt_of_layout +from magic_pdf.libs.convert_utils import dict_to_list +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.hash_utils import compute_md5 +from magic_pdf.libs.math import float_equal +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.model.magic_model import MagicModel +from magic_pdf.para.para_split_v2 import para_split +from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker +from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2 +from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table +from magic_pdf.pre_proc.equations_replace import remove_chars_in_text_blocks, replace_equations_in_textblock, \ + combine_chars_to_pymudict +from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split +from magic_pdf.pre_proc.ocr_dict_merge import sort_blocks_by_layout, fill_spans_in_blocks, 
fix_block_spans, \ + fix_discarded_block +from magic_pdf.pre_proc.ocr_span_list_modify import remove_overlaps_min_spans, get_qa_need_list_v2, \ + remove_overlaps_low_confidence_spans +from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap + + +def remove_horizontal_overlap_block_which_smaller(all_bboxes): + useful_blocks = [] + for bbox in all_bboxes: + useful_blocks.append({ + "bbox": bbox[:4] + }) + is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = check_useful_block_horizontal_overlap(useful_blocks) + if is_useful_block_horz_overlap: + logger.warning( + f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}") + for bbox in all_bboxes.copy(): + if smaller_bbox == bbox[:4]: + all_bboxes.remove(bbox) + + return is_useful_block_horz_overlap, all_bboxes + + +def txt_spans_extract(pdf_page, inline_equations, interline_equations): + text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] + char_level_text_blocks = pdf_page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ + "blocks" + ] + text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks) + text_blocks = replace_equations_in_textblock( + text_blocks, inline_equations, interline_equations + ) + text_blocks = remove_citation_marker(text_blocks) + text_blocks = remove_chars_in_text_blocks(text_blocks) + spans = [] + for v in text_blocks: + for line in v["lines"]: + for span in line["spans"]: + bbox = span["bbox"] + if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]): + continue + if span.get('type') not in (ContentType.InlineEquation, ContentType.InterlineEquation): + spans.append( + { + "bbox": list(span["bbox"]), + "content": span["text"], + "type": ContentType.Text, + "score": 1.0, + } + ) + return spans + + +def replace_text_span(pymu_spans, ocr_spans): + return list(filter(lambda x: x["type"] != ContentType.Text, ocr_spans)) + pymu_spans 
+ + +def parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode): + need_drop = False + drop_reason = [] + + '''从magic_model对象中获取后面会用到的区块信息''' + img_blocks = magic_model.get_imgs(page_id) + table_blocks = magic_model.get_tables(page_id) + discarded_blocks = magic_model.get_discarded(page_id) + text_blocks = magic_model.get_text_blocks(page_id) + title_blocks = magic_model.get_title_blocks(page_id) + inline_equations, interline_equations, interline_equation_blocks = magic_model.get_equations(page_id) + + page_w, page_h = magic_model.get_page_size(page_id) + + spans = magic_model.get_all_spans(page_id) + + '''根据parse_mode,构造spans''' + if parse_mode == "txt": + """ocr 中文本类的 span 用 pymu spans 替换!""" + pymu_spans = txt_spans_extract( + pdf_docs[page_id], inline_equations, interline_equations + ) + spans = replace_text_span(pymu_spans, spans) + elif parse_mode == "ocr": + pass + else: + raise Exception("parse_mode must be txt or ocr") + + '''删除重叠spans中置信度较低的那些''' + spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans) + '''删除重叠spans中较小的那些''' + spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans) + '''对image和table截图''' + spans = ocr_cut_image_and_table(spans, pdf_docs[page_id], page_id, pdf_bytes_md5, imageWriter) + + '''将所有区块的bbox整理到一起''' + # @todo interline_equation_blocks参数不够准,后面切换到interline_equations上 + if len(interline_equation_blocks) > 0: + all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( + img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, + interline_equation_blocks, page_w, page_h) + else: + all_bboxes, all_discarded_blocks, drop_reasons = ocr_prepare_bboxes_for_layout_split( + img_blocks, table_blocks, discarded_blocks, text_blocks, title_blocks, + interline_equations, page_w, page_h) + if len(drop_reasons) > 0: + need_drop = True + drop_reason.append(DropReason.OVERLAP_BLOCKS_CAN_NOT_SEPARATION) + + 
'''先处理不需要排版的discarded_blocks''' + discarded_block_with_spans, spans = fill_spans_in_blocks(all_discarded_blocks, spans, 0.4) + fix_discarded_blocks = fix_discarded_block(discarded_block_with_spans) + + '''如果当前页面没有bbox则跳过''' + if len(all_bboxes) == 0: + logger.warning(f"skip this page, not found useful bbox, page_id: {page_id}") + return ocr_construct_page_component_v2([], [], page_id, page_w, page_h, [], + [], [], interline_equations, fix_discarded_blocks, + need_drop, drop_reason) + + """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ + + while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况 + is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes) + if is_useful_block_horz_overlap: + need_drop = True + drop_reason.append(DropReason.USEFUL_BLOCK_HOR_OVERLAP) + else: + break + + '''根据区块信息计算layout''' + page_boundry = [0, 0, page_w, page_h] + layout_bboxes, layout_tree = get_bboxes_layout(all_bboxes, page_boundry, page_id) + + if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0: + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}") + need_drop = True + drop_reason.append(DropReason.CAN_NOT_DETECT_PAGE_LAYOUT) + + """以下去掉复杂的布局和超过2列的布局""" + if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局 + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}") + need_drop = True + drop_reason.append(DropReason.COMPLICATED_LAYOUT) + + layout_column_width = get_columns_cnt_of_layout(layout_tree) + if layout_column_width > 2: # 去掉超过2列的布局pdf + logger.warning( + f"skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}") + need_drop = True + drop_reason.append(DropReason.TOO_MANY_LAYOUT_COLUMNS) + + '''根据layout顺序,对当前页面所有需要留下的block进行排序''' + sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes) + + 
'''将span填入排好序的blocks中''' + block_with_spans, spans = fill_spans_in_blocks(sorted_blocks, spans, 0.6) + + '''对block进行fix操作''' + fix_blocks = fix_block_spans(block_with_spans, img_blocks, table_blocks) + + '''获取QA需要外置的list''' + images, tables, interline_equations = get_qa_need_list_v2(fix_blocks) + + '''构造pdf_info_dict''' + page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, + images, tables, interline_equations, fix_discarded_blocks, + need_drop, drop_reason) + return page_info + + +def pdf_parse_union(pdf_bytes, + model_list, + imageWriter, + parse_mode, + start_page_id=0, + end_page_id=None, + debug_mode=False, + ): + pdf_bytes_md5 = compute_md5(pdf_bytes) + pdf_docs = fitz.open("pdf", pdf_bytes) + + '''初始化空的pdf_info_dict''' + pdf_info_dict = {} + + '''用model_list和docs对象初始化magic_model''' + magic_model = MagicModel(model_list, pdf_docs) + + '''根据输入的起始范围解析pdf''' + end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 + + '''初始化启动时间''' + start_time = time.time() + + for page_id in range(start_page_id, end_page_id + 1): + + '''debug时输出每页解析的耗时''' + if debug_mode: + time_now = time.time() + logger.info( + f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" + ) + start_time = time_now + + '''解析pdf中的每一页''' + page_info = parse_page_core(pdf_docs, magic_model, page_id, pdf_bytes_md5, imageWriter, parse_mode) + pdf_info_dict[f"page_{page_id}"] = page_info + + """分段""" + para_split(pdf_info_dict, debug_mode=debug_mode) + + """dict转list""" + pdf_info_list = dict_to_list(pdf_info_dict) + new_pdf_info_dict = { + "pdf_info": pdf_info_list, + } + + return new_pdf_info_dict + + +if __name__ == '__main__': + pass diff --git a/magic_pdf/pipe/AbsPipe.py b/magic_pdf/pipe/AbsPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..f802a9a55d3c9c5745f40ff8be239f1b29565644 --- /dev/null +++ b/magic_pdf/pipe/AbsPipe.py @@ -0,0 +1,107 @@ +from abc import ABC, abstractmethod + +from 
magic_pdf.dict2md.ocr_mkcontent import union_make +from magic_pdf.filter.pdf_classify_by_type import classify +from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan +from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.json_compressor import JsonCompressor + + +class AbsPipe(ABC): + """ + txt和ocr处理的抽象类 + """ + PIP_OCR = "ocr" + PIP_TXT = "txt" + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False): + self.pdf_bytes = pdf_bytes + self.model_list = model_list + self.image_writer = image_writer + self.pdf_mid_data = None # 未压缩 + self.is_debug = is_debug + + def get_compress_pdf_mid_data(self): + return JsonCompressor.compress_json(self.pdf_mid_data) + + @abstractmethod + def pipe_classify(self): + """ + 有状态的分类 + """ + raise NotImplementedError + + @abstractmethod + def pipe_analyze(self): + """ + 有状态的跑模型分析 + """ + raise NotImplementedError + + @abstractmethod + def pipe_parse(self): + """ + 有状态的解析 + """ + raise NotImplementedError + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode) + return content_list + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode) + return md_content + + @staticmethod + def classify(pdf_bytes: bytes) -> str: + """ + 根据pdf的元数据,判断是文本pdf,还是ocr pdf + """ + pdf_meta = pdf_meta_scan(pdf_bytes) + if pdf_meta.get("_need_drop", False): # 如果返回了需要丢弃的标志,则抛出异常 + raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}") + else: + is_encrypted = pdf_meta["is_encrypted"] + is_needs_password = pdf_meta["is_needs_password"] + if 
is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 + raise Exception(f"pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}") + else: + is_text_pdf, results = classify( + pdf_meta["total_page"], + pdf_meta["page_width_pts"], + pdf_meta["page_height_pts"], + pdf_meta["image_info_per_page"], + pdf_meta["text_len_per_page"], + pdf_meta["imgs_per_page"], + pdf_meta["text_layout_per_page"], + pdf_meta["invalid_chars"], + ) + if is_text_pdf: + return AbsPipe.PIP_TXT + else: + return AbsPipe.PIP_OCR + + @staticmethod + def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list: + """ + 根据pdf类型,生成统一格式content_list + """ + pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) + pdf_info_list = pdf_mid_data["pdf_info"] + content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path) + return content_list + + @staticmethod + def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list: + """ + 根据pdf类型,markdown + """ + pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data) + pdf_info_list = pdf_mid_data["pdf_info"] + md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path) + return md_content + + diff --git a/magic_pdf/pipe/OCRPipe.py b/magic_pdf/pipe/OCRPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..a46bdfd9788bb58d0594b65b47d60ef5465fb8c3 --- /dev/null +++ b/magic_pdf/pipe/OCRPipe.py @@ -0,0 +1,32 @@ +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_ocr_pdf + + +class OCRPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: 
bool = False): + super().__init__(pdf_bytes, model_list, image_writer, is_debug) + + def pipe_classify(self): + pass + + def pipe_analyze(self): + self.model_list = doc_analyze(self.pdf_bytes, ocr=True) + + def pipe_parse(self): + self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("ocr_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"ocr_pipe mk {md_make_mode} finished") + return result diff --git a/magic_pdf/pipe/TXTPipe.py b/magic_pdf/pipe/TXTPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..647b50699c217cc25a73fdac1bb038c8f661f109 --- /dev/null +++ b/magic_pdf/pipe/TXTPipe.py @@ -0,0 +1,33 @@ +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.libs.json_compressor import JsonCompressor +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_txt_pdf + + +class TXTPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: AbsReaderWriter, is_debug: bool = False): + super().__init__(pdf_bytes, model_list, image_writer, is_debug) + + def pipe_classify(self): + pass + + def pipe_analyze(self): + self.model_list = doc_analyze(self.pdf_bytes, ocr=False) + + def pipe_parse(self): + self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = 
super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("txt_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"txt_pipe mk {md_make_mode} finished") + return result diff --git a/magic_pdf/pipe/UNIPipe.py b/magic_pdf/pipe/UNIPipe.py new file mode 100644 index 0000000000000000000000000000000000000000..243a8964edd33796e0f0be20ee29f2e7e603fa50 --- /dev/null +++ b/magic_pdf/pipe/UNIPipe.py @@ -0,0 +1,85 @@ +import json + +from loguru import logger + +from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter +from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter +from magic_pdf.libs.commons import join_path +from magic_pdf.pipe.AbsPipe import AbsPipe +from magic_pdf.user_api import parse_union_pdf, parse_ocr_pdf + + +class UNIPipe(AbsPipe): + + def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: AbsReaderWriter, is_debug: bool = False): + self.pdf_type = jso_useful_key["_pdf_type"] + super().__init__(pdf_bytes, jso_useful_key["model_list"], image_writer, is_debug) + if len(self.model_list) == 0: + self.input_model_is_empty = True + else: + self.input_model_is_empty = False + + def pipe_classify(self): + self.pdf_type = AbsPipe.classify(self.pdf_bytes) + + def pipe_analyze(self): + if self.pdf_type == self.PIP_TXT: + self.model_list = doc_analyze(self.pdf_bytes, ocr=False) + elif self.pdf_type == self.PIP_OCR: + self.model_list = doc_analyze(self.pdf_bytes, ocr=True) + + def pipe_parse(self): + if self.pdf_type == self.PIP_TXT: + self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer, + is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty) + elif 
self.pdf_type == self.PIP_OCR: + self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, + is_debug=self.is_debug) + + def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF): + result = super().pipe_mk_uni_format(img_parent_path, drop_mode) + logger.info("uni_pipe mk content list finished") + return result + + def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD): + result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode) + logger.info(f"uni_pipe mk {md_make_mode} finished") + return result + + +if __name__ == '__main__': + # 测试 + drw = DiskReaderWriter(r"D:/project/20231108code-clean") + + pdf_file_path = r"linshixuqiu\19983-00.pdf" + model_file_path = r"linshixuqiu\19983-00.json" + pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN) + model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT) + model_list = json.loads(model_json_txt) + write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00" + img_bucket_path = "imgs" + img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path)) + + # pdf_type = UNIPipe.classify(pdf_bytes) + # jso_useful_key = { + # "_pdf_type": pdf_type, + # "model_list": model_list + # } + + jso_useful_key = { + "_pdf_type": "", + "model_list": model_list + } + pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer) + pipe.pipe_classify() + pipe.pipe_parse() + md_content = pipe.pipe_mk_markdown(img_bucket_path) + content_list = pipe.pipe_mk_uni_format(img_bucket_path) + + md_writer = DiskReaderWriter(write_path) + md_writer.write(md_content, "19983-00.md", AbsReaderWriter.MODE_TXT) + md_writer.write(json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4), "19983-00.json", + AbsReaderWriter.MODE_TXT) + md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT) diff --git a/magic_pdf/pipe/__init__.py b/magic_pdf/pipe/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/post_proc/__init__.py b/magic_pdf/post_proc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/post_proc/detect_para.py b/magic_pdf/post_proc/detect_para.py new file mode 100644 index 0000000000000000000000000000000000000000..17b41d27cff8c87ab9bdd4f040e9ed14904f91cd --- /dev/null +++ b/magic_pdf/post_proc/detect_para.py @@ -0,0 +1,3472 @@ +import os +import sys +import json +import re +import math +import unicodedata +from collections import Counter + + +import numpy as np +from termcolor import cprint + + +from magic_pdf.libs.commons import fitz +from magic_pdf.libs.nlp_utils import NLPModels + + +if sys.version_info[0] >= 3: + sys.stdout.reconfigure(encoding="utf-8") # type: ignore + + +def open_pdf(pdf_path): + try: + pdf_document = fitz.open(pdf_path) # type: ignore + return pdf_document + except Exception as e: + print(f"无法打开PDF文件:{pdf_path}。原因是:{e}") + raise e + + +def print_green_on_red(text): + cprint(text, "green", "on_red", attrs=["bold"], end="\n\n") + + +def print_green(text): + print() + cprint(text, "green", attrs=["bold"], end="\n\n") + + +def print_red(text): + print() + cprint(text, "red", attrs=["bold"], end="\n\n") + + +def print_yellow(text): + print() + cprint(text, "yellow", attrs=["bold"], end="\n\n") + + +def safe_get(dict_obj, key, default): + val = dict_obj.get(key) + if val is None: + return default + else: + return val + + +def is_bbox_overlap(bbox1, bbox2): + """ + This function checks if bbox1 and bbox2 overlap or not + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 and bbox2 overlap, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 > x1_2 or x0_2 > x1_1: + return False + if y0_1 > y1_2 or y0_2 > y1_1: + return False + + 
return True + + +def is_in_bbox(bbox1, bbox2): + """ + This function checks if bbox1 is in bbox2 + + Parameters + ---------- + bbox1 : list + bbox1 + bbox2 : list + bbox2 + + Returns + ------- + bool + True if bbox1 is in bbox2, else False + """ + x0_1, y0_1, x1_1, y1_1 = bbox1 + x0_2, y0_2, x1_2, y1_2 = bbox2 + + if x0_1 >= x0_2 and y0_1 >= y0_2 and x1_1 <= x1_2 and y1_1 <= y1_2: + return True + else: + return False + + +def calculate_para_bbox(lines): + """ + This function calculates the minimum bbox of the paragraph + + Parameters + ---------- + lines : list + lines + + Returns + ------- + para_bbox : list + bbox of the paragraph + """ + x0 = min(line["bbox"][0] for line in lines) + y0 = min(line["bbox"][1] for line in lines) + x1 = max(line["bbox"][2] for line in lines) + y1 = max(line["bbox"][3] for line in lines) + return [x0, y0, x1, y1] + + +def is_line_right_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is right aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is right aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + _, _, x1, _ = curr_line_bbox + _, _, prev_x1, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + _, _, next_x1, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x1 - prev_x1) < horizontal_thres + elif direction == 1: + return abs(x1 - next_x1) < horizontal_thres + elif direction == 2: + return abs(x1 - prev_x1) < horizontal_thres and abs(x1 - next_x1) < horizontal_thres + else: + return False + + +def is_line_left_aligned_from_neighbors(curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, direction=2): + """ + This function checks if the line is left aligned from its neighbors + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is left aligned from its neighbors, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, _, _ = curr_line_bbox + prev_x0, _, _, _ = prev_line_bbox if prev_line_bbox else (0, 0, 0, 0) + next_x0, _, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + if direction == 0: + return abs(x0 - prev_x0) < horizontal_thres + elif direction == 1: + return abs(x0 - next_x0) < horizontal_thres + elif direction == 2: + return abs(x0 - prev_x0) < horizontal_thres and abs(x0 - next_x0) < horizontal_thres + else: + return False + + +def end_with_punctuation(line_text): + """ + This function checks if the line ends with punctuation marks + """ + + english_end_puncs = [".", "?", "!"] + chinese_end_puncs = ["。", "?", "!"] + end_puncs = english_end_puncs + chinese_end_puncs + + last_non_space_char = None + for ch in line_text[::-1]: + if not ch.isspace(): + last_non_space_char = ch + break + + if last_non_space_char is None: + return False + + return last_non_space_char in end_puncs + + +def is_nested_list(lst): + if isinstance(lst, list): + return any(isinstance(sub, list) for sub in lst) + return False + + +class DenseSingleLineBlockException(Exception): + """ + This class defines the exception type for dense single line-block. + """ + + def __init__(self, message="DenseSingleLineBlockException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleDetectionException(Exception): + """ + This class defines the exception type for title detection. + """ + + def __init__(self, message="TitleDetectionException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class TitleLevelException(Exception): + """ + This class defines the exception type for title level. 
+ """ + + def __init__(self, message="TitleLevelException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaSplitException(Exception): + """ + This class defines the exception type for paragraph splitting. + """ + + def __init__(self, message="ParaSplitException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class ParaMergeException(Exception): + """ + This class defines the exception type for paragraph merging. + """ + + def __init__(self, message="ParaMergeException"): + self.message = message + super().__init__(self.message) + + def __str__(self): + return f"{self.message}" + + def __repr__(self): + return f"{self.message}" + + +class DiscardByException: + """ + This class discards pdf files by exception + """ + + def __init__(self) -> None: + pass + + def discard_by_single_line_block(self, pdf_dic, exception: DenseSingleLineBlockException): + """ + This function discards pdf files by single line block exception + + Parameters + ---------- + pdf_dic : dict + pdf dictionary + exception : str + exception message + + Returns + ------- + error_message : str + """ + exception_page_nums = 0 + page_num = 0 + for page_id, page in pdf_dic.items(): + if page_id.startswith("page_"): + page_num += 1 + if "preproc_blocks" in page.keys(): + preproc_blocks = page["preproc_blocks"] + + all_single_line_blocks = [] + for block in preproc_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + if len(preproc_blocks) > 0 and len(all_single_line_blocks) / len(preproc_blocks) > 0.9: + exception_page_nums += 1 + + if page_num == 0: + return None + + if exception_page_nums / page_num > 0.1: # Low ratio means basically, whenever this is the case, it is discarded + return exception.message + + return None + + def 
discard_by_title_detection(self, pdf_dic, exception: TitleDetectionException):
        """
        This function discards pdf files by title detection exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: title-detection failures never discard a document.
        # return exception.message
        return None

    def discard_by_title_level(self, pdf_dic, exception: TitleLevelException):
        """
        This function discards pdf files by title level exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: title-level failures never discard a document.
        # return exception.message
        return None

    def discard_by_split_para(self, pdf_dic, exception: ParaSplitException):
        """
        This function discards pdf files by split para exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: paragraph-split failures never discard a document.
        # return exception.message
        return None

    def discard_by_merge_para(self, pdf_dic, exception: ParaMergeException):
        """
        This function discards pdf files by merge para exception

        Parameters
        ----------
        pdf_dic : dict
            pdf dictionary
        exception : str
            exception message

        Returns
        -------
        error_message : str
        """
        # Currently a no-op: paragraph-merge failures never discard a document.
        # return exception.message
        return None


class LayoutFilterProcessor:
    def __init__(self) -> None:
        pass

    def batch_process_blocks(self, pdf_dict):
        """
        This function processes the blocks in batch.

        Parameters
        ----------
        self : object
            The instance of the class.
+ + pdf_dict : dict + pdf dictionary + + Returns + ------- + pdf_dict : dict + pdf dictionary + """ + for page_id, blocks in pdf_dict.items(): + if page_id.startswith("page_"): + if "layout_bboxes" in blocks.keys() and "para_blocks" in blocks.keys(): + layout_bbox_objs = blocks["layout_bboxes"] + if layout_bbox_objs is None: + continue + layout_bboxes = [bbox_obj["layout_bbox"] for bbox_obj in layout_bbox_objs] + + # Enlarge each value of x0, y0, x1, y1 for each layout_bbox to prevent loss of text. + layout_bboxes = [ + [math.ceil(x0), math.ceil(y0), math.ceil(x1), math.ceil(y1)] for x0, y0, x1, y1 in layout_bboxes + ] + + para_blocks = blocks["para_blocks"] + if para_blocks is None: + continue + + for lb_bbox in layout_bboxes: + for i, para_block in enumerate(para_blocks): + para_bbox = para_block["bbox"] + para_blocks[i]["in_layout"] = 0 + if is_in_bbox(para_bbox, lb_bbox): + para_blocks[i]["in_layout"] = 1 + + blocks["para_blocks"] = para_blocks + + return pdf_dict + + +class RawBlockProcessor: + def __init__(self) -> None: + self.y_tolerance = 2 + self.pdf_dic = {} + + def __span_flags_decomposer(self, span_flags): + """ + Make font flags human readable. + + Parameters + ---------- + self : object + The instance of the class. 

        span_flags : int
            span flags

        Returns
        -------
        l : dict
            decomposed flags
        """

        l = {
            "is_superscript": False,
            "is_italic": False,
            "is_serifed": False,
            "is_sans_serifed": False,
            "is_monospaced": False,
            "is_proportional": False,
            "is_bold": False,
        }

        if span_flags & 2**0:
            l["is_superscript"] = True  # superscript

        if span_flags & 2**1:
            l["is_italic"] = True  # italic

        if span_flags & 2**2:
            l["is_serifed"] = True  # serifed font
        else:
            l["is_sans_serifed"] = True  # sans-serif font

        if span_flags & 2**3:
            l["is_monospaced"] = True  # monospaced font
        else:
            l["is_proportional"] = True  # proportional font

        if span_flags & 2**4:
            l["is_bold"] = True  # bold

        return l

    def __make_new_lines(self, raw_lines):
        """
        This function makes new lines.

        Merges consecutive raw lines that sit on (almost) the same baseline
        into single logical lines, decomposing each span's font flags on the way.

        Parameters
        ----------
        self : object
            The instance of the class.

        raw_lines : list
            raw lines

        Returns
        -------
        new_lines : list
            new lines
        """
        new_lines = []
        new_line = None

        for raw_line in raw_lines:
            raw_line_bbox = raw_line["bbox"]
            raw_line_spans = raw_line["spans"]
            raw_line_text = "".join([span["text"] for span in raw_line_spans])
            raw_line_dir = raw_line.get("dir", None)

            decomposed_line_spans = []
            for span in raw_line_spans:
                raw_flags = span["flags"]
                decomposed_flags = self.__span_flags_decomposer(raw_flags)
                span["decomposed_flags"] = decomposed_flags
                decomposed_line_spans.append(span)

            if new_line is None:  # Handle the first line
                new_line = {
                    "bbox": raw_line_bbox,
                    "text": raw_line_text,
                    "dir": raw_line_dir if raw_line_dir else (0, 0),
                    "spans": decomposed_line_spans,
                }
            else:  # Handle the rest lines
                # Same baseline: both top and bottom edges within y_tolerance.
                if (
                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
                ):
                    new_line["bbox"] = (
                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
                        new_line["bbox"][1],  # top
                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
                        raw_line_bbox[3],  # bottom
                    )
new_line["text"] += raw_line_text + new_line["spans"].extend(raw_line_spans) + new_line["dir"] = ( + new_line["dir"][0] + raw_line_dir[0], + new_line["dir"][1] + raw_line_dir[1], + ) + else: + new_lines.append(new_line) + new_line = { + "bbox": raw_line_bbox, + "text": raw_line_text, + "dir": raw_line_dir if raw_line_dir else (0, 0), + "spans": raw_line_spans, + } + if new_line: + new_lines.append(new_line) + + return new_lines + + def __make_new_block(self, raw_block): + """ + This function makes a new block. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + raw_block : dict + a raw block + + Returns + ------- + new_block : dict + """ + new_block = {} + + block_id = raw_block["number"] + block_bbox = raw_block["bbox"] + block_text = "".join(span["text"] for line in raw_block["lines"] for span in line["spans"]) + raw_lines = raw_block["lines"] + block_lines = self.__make_new_lines(raw_lines) + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["lines"] = block_lines + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. + + Parameters + ---------- + self : object + The instance of the class. + ---------- + blocks : list + Input block is a list of raw blocks. + + Returns + ------- + result_dict : dict + result dictionary + """ + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "preproc_blocks" in blocks.keys(): + input_blocks = blocks["preproc_blocks"] + for raw_block in input_blocks: + new_block = self.__make_new_block(raw_block) + para_blocks.append(new_block) + + blocks["para_blocks"] = para_blocks + + return pdf_dic + + +class BlockStatisticsCalculator: + """ + This class calculates the statistics of the block. 
+ """ + + def __init__(self) -> None: + pass + + def __calc_stats_of_new_lines(self, new_lines): + """ + This function calculates the paragraph metrics + + Parameters + ---------- + combined_lines : list + combined lines + + Returns + ------- + X0 : float + Median of x0 values, which represents the left average boundary of the block + X1 : float + Median of x1 values, which represents the right average boundary of the block + avg_char_width : float + Average of char widths, which represents the average char width of the block + avg_char_height : float + Average of line heights, which represents the average line height of the block + + """ + x0_values = [] + x1_values = [] + char_widths = [] + char_heights = [] + + block_font_types = [] + block_font_sizes = [] + block_directions = [] + + if len(new_lines) > 0: + for i, line in enumerate(new_lines): + line_bbox = line["bbox"] + line_text = line["text"] + line_spans = line["spans"] + + num_chars = len([ch for ch in line_text if not ch.isspace()]) + + x0_values.append(line_bbox[0]) + x1_values.append(line_bbox[2]) + + if num_chars > 0: + char_width = (line_bbox[2] - line_bbox[0]) / num_chars + char_widths.append(char_width) + + for span in line_spans: + block_font_types.append(span["font"]) + block_font_sizes.append(span["size"]) + + if "dir" in line: + block_directions.append(line["dir"]) + + # line_font_types = [span["font"] for span in line_spans] + char_heights = [span["size"] for span in line_spans] + + X0 = np.median(x0_values) if x0_values else 0 + X1 = np.median(x1_values) if x1_values else 0 + avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0 + avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0 + + # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None + + max_span_length = 0 + max_span_font_type = None + for line in new_lines: + line_spans = line["spans"] + for span in line_spans: + span_length = 
span["bbox"][2] - span["bbox"][0] + if span_length > max_span_length: + max_span_length = span_length + max_span_font_type = span["font"] + + max_freq_font_type = max_span_font_type + + avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None + + avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0 + avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0 + + median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None + + return ( + X0, + X1, + avg_char_width, + avg_char_height, + max_freq_font_type, + avg_font_size, + (avg_dir_horizontal, avg_dir_vertical), + median_font_size, + ) + + def __make_new_block(self, input_block): + new_block = {} + + raw_lines = input_block["lines"] + stats = self.__calc_stats_of_new_lines(raw_lines) + + block_id = input_block["block_id"] + block_bbox = input_block["bbox"] + block_text = input_block["text"] + block_lines = raw_lines + block_avg_left_boundary = stats[0] + block_avg_right_boundary = stats[1] + block_avg_char_width = stats[2] + block_avg_char_height = stats[3] + block_font_type = stats[4] + block_font_size = stats[5] + block_direction = stats[6] + block_median_font_size = stats[7] + + new_block["block_id"] = block_id + new_block["bbox"] = block_bbox + new_block["text"] = block_text + new_block["dir"] = block_direction + new_block["X0"] = block_avg_left_boundary + new_block["X1"] = block_avg_right_boundary + new_block["avg_char_width"] = block_avg_char_width + new_block["avg_char_height"] = block_avg_char_height + new_block["block_font_type"] = block_font_type + new_block["block_font_size"] = block_font_size + new_block["lines"] = block_lines + new_block["median_font_size"] = block_median_font_size + + return new_block + + def batch_process_blocks(self, pdf_dic): + """ + This function processes the blocks in batch. 

        Parameters
        ----------
        self : object
            The instance of the class.
        ----------
        blocks : list
            Input block is a list of raw blocks.
            Schema can refer to the value of key ""preproc_blocks".

        Returns
        -------
        result_dict : dict
            result dictionary
        """

        for page_id, blocks in pdf_dic.items():
            if page_id.startswith("page_"):
                para_blocks = []
                if "para_blocks" in blocks.keys():
                    input_blocks = blocks["para_blocks"]
                    for input_block in input_blocks:
                        new_block = self.__make_new_block(input_block)
                        para_blocks.append(new_block)

                blocks["para_blocks"] = para_blocks

        return pdf_dic


class DocStatisticsCalculator:
    """
    This class calculates the statistics of the document.
    """

    def __init__(self) -> None:
        pass

    def calc_stats_of_doc(self, pdf_dict):
        """
        This function computes the statistics of the document

        Parameters
        ----------
        result_dict : dict
            result dictionary

        Returns
        -------
        statistics : dict
            statistics of the document
        """

        total_text_length = 0
        total_num_blocks = 0

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                if "para_blocks" in blocks.keys():
                    para_blocks = blocks["para_blocks"]
                    for para_block in para_blocks:
                        total_text_length += len(para_block["text"])
                        total_num_blocks += 1

        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0

        font_list = []

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                if "para_blocks" in blocks.keys():
                    input_blocks = blocks["para_blocks"]
                    for input_block in input_blocks:
                        block_text_length = len(input_block.get("text", ""))
                        # Skip short blocks (headers/footers/captions) so they do
                        # not skew the dominant-font statistics.
                        if block_text_length < avg_text_length * 0.5:
                            continue
                        block_font_type = safe_get(input_block, "block_font_type", "")
                        block_font_size = safe_get(input_block, "block_font_size", 0)
                        font_list.append((block_font_type, block_font_size))

        font_counter = Counter(font_list)
        most_common_font = font_counter.most_common(1)[0] 
if font_list else (("", 0), 0)
        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)

        # Each *_font entry is ((font_type, font_size), occurrence_count).
        statistics = {
            "num_pages": 0,
            "num_blocks": 0,
            "num_paras": 0,
            "num_titles": 0,
            "num_header_blocks": 0,
            "num_footer_blocks": 0,
            "num_watermark_blocks": 0,
            "num_vertical_margin_note_blocks": 0,
            "most_common_font_type": most_common_font[0][0],
            "most_common_font_size": most_common_font[0][1],
            "number_of_most_common_font": most_common_font[1],
            "second_most_common_font_type": second_most_common_font[0][0],
            "second_most_common_font_size": second_most_common_font[0][1],
            "number_of_second_most_common_font": second_most_common_font[1],
            "avg_text_length": avg_text_length,
        }

        for page_id, blocks in pdf_dict.items():
            if page_id.startswith("page_"):
                blocks = pdf_dict[page_id]["para_blocks"]
                statistics["num_pages"] += 1
                for block_id, block_data in enumerate(blocks):
                    statistics["num_blocks"] += 1

                    if "paras" in block_data.keys():
                        statistics["num_paras"] += len(block_data["paras"])

                    for line in block_data["lines"]:
                        if line.get("is_title", 0):
                            statistics["num_titles"] += 1

                    if block_data.get("is_header", 0):
                        statistics["num_header_blocks"] += 1
                    if block_data.get("is_footer", 0):
                        statistics["num_footer_blocks"] += 1
                    if block_data.get("is_watermark", 0):
                        statistics["num_watermark_blocks"] += 1
                    if block_data.get("is_vertical_margin_note", 0):
                        statistics["num_vertical_margin_note_blocks"] += 1

        pdf_dict["statistics"] = statistics

        return pdf_dict


class TitleProcessor:
    """
    This class processes the title.
+ """ + + def __init__(self, *doc_statistics) -> None: + if len(doc_statistics) > 0: + self.doc_statistics = doc_statistics[0] + + self.nlp_model = NLPModels() + self.MAX_TITLE_LEVEL = 3 + self.numbered_title_pattern = r""" + ^ # 行首 + ( # 开始捕获组 + [\(\(]\d+[\)\)] # 括号内数字,支持中文和英文括号,例如:(1) 或 (1) + |\d+[\)\)]\s # 数字后跟右括号和空格,支持中文和英文括号,例如:2) 或 2) + |[\(\(][A-Z][\)\)] # 括号内大写字母,支持中文和英文括号,例如:(A) 或 (A) + |[A-Z][\)\)]\s # 大写字母后跟右括号和空格,例如:A) 或 A) + |[\(\(][IVXLCDM]+[\)\)] # 括号内罗马数字,支持中文和英文括号,例如:(I) 或 (I) + |[IVXLCDM]+[\)\)]\s # 罗马数字后跟右括号和空格,例如:I) 或 I) + |\d+(\.\d+)*\s # 数字或复合数字编号后跟空格,例如:1. 或 3.2.1 + |[一二三四五六七八九十百千]+[、\s] # 中文序号后跟顿号和空格,例如:一、 + |[\(|\(][一二三四五六七八九十百千]+[\)|\)]\s* # 中文括号内中文序号后跟空格,例如:(一) + |[A-Z]\.\d+(\.\d+)?\s # 大写字母后跟点和数字,例如:A.1 或 A.1.1 + |[\(\(][a-z][\)\)] # 括号内小写字母,支持中文和英文括号,例如:(a) 或 (a) + |[a-z]\)\s # 小写字母后跟右括号和空格,例如:a) + |[A-Z]-\s # 大写字母后跟短横线和空格,例如:A- + |\w+:\s # 英文序号词后跟冒号和空格,例如:First: + |第[一二三四五六七八九十百千]+[章节部分条款]\s # 以“第”开头的中文标题后跟空格 + |[IVXLCDM]+\. # 罗马数字后跟点,例如:I. + |\d+\.\s # 单个数字后跟点和空格,例如:1. + ) # 结束捕获组 + .+ # 标题的其余部分 + """ + + def _is_potential_title( + self, + curr_line, + prev_line, + prev_line_is_title, + next_line, + avg_char_width, + avg_char_height, + median_font_size, + ): + """ + This function checks if the line is a potential title. + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the line is a potential title, False otherwise. + """ + + def __is_line_centered(line_bbox, page_bbox, avg_char_width): + """ + This function checks if the line is centered on the page + + Parameters + ---------- + line_bbox : list + bbox of the line + page_bbox : list + bbox of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is centered on the page, False otherwise. 
+ """ + horizontal_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + + x0, _, x1, _ = line_bbox + _, _, page_x1, _ = page_bbox + + return abs((x0 + x1) / 2 - page_x1 / 2) < horizontal_thres + + def __is_bold_font_line(line): + """ + Check if a line contains any bold font style. + """ + + def _is_bold_span(span): + # if span text is empty or only contains space, return False + if not span["text"].strip(): + return False + + return bool(span["flags"] & 2**4) # Check if the font is bold + + for span in line["spans"]: + if not _is_bold_span(span): + return False + + return True + + def __is_italic_font_line(line): + """ + Check if a line contains any italic font style. + """ + + def __is_italic_span(span): + return bool(span["flags"] & 2**1) # Check if the font is italic + + for span in line["spans"]: + if not __is_italic_span(span): + return False + + return True + + def __is_punctuation_heavy(line_text): + """ + Check if the line contains a high ratio of punctuation marks, which may indicate + that the line is not a title. + + Parameters: + line_text (str): Text of the line. + + Returns: + bool: True if the line is heavy with punctuation, False otherwise. + """ + # Pattern for common title format like "X.Y. 
Title" + pattern = r"\b\d+\.\d+\..*\b" + + # If the line matches the title format, return False + if re.match(pattern, line_text.strip()): + return False + + # Find all punctuation marks in the line + punctuation_marks = re.findall(r"[^\w\s]", line_text) + number_of_punctuation_marks = len(punctuation_marks) + + text_length = len(line_text) + + if text_length == 0: + return False + + punctuation_ratio = number_of_punctuation_marks / text_length + if punctuation_ratio >= 0.1: + return True + + return False + + def __has_mixed_font_styles(spans, strict_mode=False): + """ + This function checks if the line has mixed font styles, the strict mode will compare the font types + + Parameters + ---------- + spans : list + spans of the line + strict_mode : bool + True for strict mode, the font types will be fully compared + False for non-strict mode, the font types will be compared by the most longest common prefix + + Returns + ------- + bool + True if the line has mixed font styles, False otherwise. 
+ """ + if strict_mode: + font_styles = set() + for span in spans: + font_style = span["font"].lower() + font_styles.add(font_style) + + return len(font_styles) > 1 + + else: # non-strict mode + font_styles = [] + for span in spans: + font_style = span["font"].lower() + font_styles.append(font_style) + + if len(font_styles) > 1: + longest_common_prefix = os.path.commonprefix(font_styles) + if len(longest_common_prefix) > 0: + return False + else: + return True + else: + return False + + def __is_different_font_type_from_neighbors(curr_line_font_type, prev_line_font_type, next_line_font_type): + """ + This function checks if the current line has a different font type from the previous and next lines + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + prev_line_font_type : str + font type of the previous line + next_line_font_type : str + font type of the next line + + Returns + ------- + bool + True if the current line has a different font type from the previous and next lines, False otherwise. + """ + return all( + curr_line_font_type != other_font_type.lower() + for other_font_type in [prev_line_font_type, next_line_font_type] + if other_font_type is not None + ) + + def __is_larger_font_size_from_neighbors(curr_line_font_size, prev_line_font_size, next_line_font_size): + """ + This function checks if the current line has a larger font size than the previous and next lines + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + prev_line_font_size : float + font size of the previous line + next_line_font_size : float + font size of the next line + + Returns + ------- + bool + True if the current line has a larger font size than the previous and next lines, False otherwise. 
+ """ + return all( + curr_line_font_size > other_font_size * 1.2 + for other_font_size in [prev_line_font_size, next_line_font_size] + if other_font_size is not None + ) + + def __is_similar_to_pre_line(curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size): + """ + This function checks if the current line is similar to the previous line + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + + Returns + ------- + bool + True if the current line is similar to the previous line, False otherwise. + """ + + if curr_line_font_type == prev_line_font_type and curr_line_font_size == prev_line_font_size: + return True + else: + return False + + def __is_same_font_type_of_docAvg(curr_line_font_type): + """ + This function checks if the current line has the same font type as the document average font type + + Parameters + ---------- + curr_line_font_type : str + font type of the current line + + Returns + ------- + bool + True if the current line has the same font type as the document average font type, False otherwise. + """ + doc_most_common_font_type = safe_get(self.doc_statistics, "most_common_font_type", "").lower() + doc_second_most_common_font_type = safe_get(self.doc_statistics, "second_most_common_font_type", "").lower() + + return curr_line_font_type.lower() in [doc_most_common_font_type, doc_second_most_common_font_type] + + def __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio: float = 1): + """ + This function checks if the current line has a large enough font size + + Parameters + ---------- + curr_line_font_size : float + font size of the current line + ratio : float + ratio of the current line font size to the document average font size + + Returns + ------- + bool + True if the current line has a large enough font size, False otherwise. 
+ """ + doc_most_common_font_size = safe_get(self.doc_statistics, "most_common_font_size", 0) + doc_second_most_common_font_size = safe_get(self.doc_statistics, "second_most_common_font_size", 0) + doc_avg_font_size = min(doc_most_common_font_size, doc_second_most_common_font_size) + + return curr_line_font_size >= doc_avg_font_size * ratio + + def __is_sufficient_spacing_above_and_below( + curr_line_bbox, + prev_line_bbox, + next_line_bbox, + avg_char_height, + median_font_size, + ): + """ + This function checks if the current line has sufficient spacing above and below + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + avg_char_height : float + average of line heights + + Returns + ------- + bool + True if the current line has sufficient spacing above and below, False otherwise. + """ + vertical_ratio = 1.25 + vertical_thres = vertical_ratio * median_font_size + + _, y0, _, y1 = curr_line_bbox + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = min(y0 - prev_line_bbox[1], y1 - prev_line_bbox[3]) + sufficient_spacing_above = vertical_spacing_above > vertical_thres + else: + sufficient_spacing_above = True + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = min(next_line_bbox[1] - y0, next_line_bbox[3] - y1) + sufficient_spacing_below = vertical_spacing_below > vertical_thres + else: + sufficient_spacing_below = True + + return (sufficient_spacing_above, sufficient_spacing_below) + + def __is_word_list_line_by_rules(curr_line_text): + """ + This function checks if the current line is a word list + + Parameters + ---------- + curr_line_text : str + text of the current line + + Returns + ------- + bool + True if the current line is a name list, False otherwise. 
+ """ + # name_list_pattern = r"([a-zA-Z][a-zA-Z\s]{0,20}[a-zA-Z]|[\u4e00-\u9fa5·]{2,16})(?=[,,;;\s]|$)" + name_list_pattern = r"(?= 0.9: + return True + + return False + + def __is_equation(line_text): + """ + This function checks if the current line is an equation. + + Parameters + ---------- + line_text : str + + Returns + ------- + bool + True if the current line is an equation, False otherwise. + """ + equation_reg = r"\$.*?\\overline.*?\$" # to match interline equations + + if re.search(equation_reg, line_text): + return True + else: + return False + + def __is_title_by_len(text, max_length=200): + """ + This function checks if the current line is a title by length. + + Parameters + ---------- + text : str + text of the current line + + max_length : int + max length of the title + + Returns + ------- + bool + True if the current line is a title, False otherwise. + + """ + text = text.strip() + return len(text) <= max_length + + def __compute_line_font_type_and_size(curr_line): + """ + This function computes the font type and font size of the line. + + Parameters + ---------- + line : dict + line + + Returns + ------- + font_type : str + font type of the line + font_size : float + font size of the line + """ + spans = curr_line["spans"] + max_accumulated_length = 0 + max_span_font_size = curr_line["spans"][0]["size"] # default value, float type + max_span_font_type = curr_line["spans"][0]["font"].lower() # default value, string type + for span in spans: + if span["text"].isspace(): + continue + span_length = span["bbox"][2] - span["bbox"][0] + if span_length > max_accumulated_length: + max_accumulated_length = span_length + max_span_font_size = span["size"] + max_span_font_type = span["font"].lower() + + return max_span_font_type, max_span_font_size + + def __is_a_consistent_sub_title(pre_line, curr_line): + """ + This function checks if the current line is a consistent sub title. 
+ + Parameters + ---------- + pre_line : dict + previous line + curr_line : dict + current line + + Returns + ------- + bool + True if the current line is a consistent sub title, False otherwise. + """ + if pre_line is None: + return False + + start_letter_of_pre_line = pre_line["text"][0] + start_letter_of_curr_line = curr_line["text"][0] + + has_same_prefix_digit = ( + start_letter_of_pre_line.isdigit() + and start_letter_of_curr_line.isdigit() + and start_letter_of_pre_line == start_letter_of_curr_line + ) + + # prefix text of curr_line satisfies the following title format: x.x + prefix_text_pattern = r"^\d+\.\d+" + has_subtitle_format = re.match(prefix_text_pattern, curr_line["text"]) + + if has_same_prefix_digit or has_subtitle_format: + return True + + """ + Title detecting main Process. + """ + + """ + Basic features about the current line. + """ + curr_line_bbox = curr_line["bbox"] + curr_line_text = curr_line["text"] + curr_line_font_type, curr_line_font_size = __compute_line_font_type_and_size(curr_line) + + if len(curr_line_text.strip()) == 0: # skip empty lines + return False, False + + prev_line_bbox = prev_line["bbox"] if prev_line else None + if prev_line: + prev_line_font_type, prev_line_font_size = __compute_line_font_type_and_size(prev_line) + else: + prev_line_font_type, prev_line_font_size = None, None + + next_line_bbox = next_line["bbox"] if next_line else None + if next_line: + next_line_font_type, next_line_font_size = __compute_line_font_type_and_size(next_line) + else: + next_line_font_type, next_line_font_size = None, None + + """ + Aggregated features about the current line. 
+ """ + is_italc_font = __is_italic_font_line(curr_line) + is_bold_font = __is_bold_font_line(curr_line) + + is_font_size_little_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=0.8) + is_font_size_not_less_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1) + is_much_larger_font_than_doc_avg = __is_font_size_not_less_than_docAvg(curr_line_font_size, ratio=1.6) + + is_not_same_font_type_of_docAvg = not __is_same_font_type_of_docAvg(curr_line_font_type) + + is_potential_title_font = is_bold_font or is_font_size_not_less_than_doc_avg or is_not_same_font_type_of_docAvg + + is_mix_font_styles_strict = __has_mixed_font_styles(curr_line["spans"], strict_mode=True) + is_mix_font_styles_loose = __has_mixed_font_styles(curr_line["spans"], strict_mode=False) + + is_punctuation_heavy = __is_punctuation_heavy(curr_line_text) + + is_word_list_line_by_rules = __is_word_list_line_by_rules(curr_line_text) + is_person_or_org_list_line_by_nlp = __get_text_catgr_by_nlp(curr_line_text) in ["PERSON", "GPE", "ORG"] + + is_font_size_larger_than_neighbors = __is_larger_font_size_from_neighbors( + curr_line_font_size, prev_line_font_size, next_line_font_size + ) + + is_font_type_diff_from_neighbors = __is_different_font_type_from_neighbors( + curr_line_font_type, prev_line_font_type, next_line_font_type + ) + + has_sufficient_spaces_above, has_sufficient_spaces_below = __is_sufficient_spacing_above_and_below( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_height, median_font_size + ) + + is_similar_to_pre_line = __is_similar_to_pre_line( + curr_line_font_type, prev_line_font_type, curr_line_font_size, prev_line_font_size + ) + + is_consis_sub_title = __is_a_consistent_sub_title(prev_line, curr_line) + + """ + Further aggregated features about the current line. + + Attention: + Features that start with __ are for internal use. 
+ """ + + __is_line_left_aligned_from_neighbors = is_line_left_aligned_from_neighbors( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width + ) + __is_font_diff_from_neighbors = is_font_size_larger_than_neighbors or is_font_type_diff_from_neighbors + is_a_left_inline_title = ( + is_mix_font_styles_strict and __is_line_left_aligned_from_neighbors and __is_font_diff_from_neighbors + ) + + is_title_by_check_prev_line = prev_line is None and has_sufficient_spaces_above and is_potential_title_font + is_title_by_check_next_line = next_line is None and has_sufficient_spaces_below and is_potential_title_font + + is_title_by_check_pre_and_next_line = ( + (prev_line is not None or next_line is not None) + and has_sufficient_spaces_above + and has_sufficient_spaces_below + and is_potential_title_font + ) + + is_numbered_title = __is_numbered_title(curr_line_text) and ( + (has_sufficient_spaces_above or prev_line is None) and (has_sufficient_spaces_below or next_line is None) + ) + + is_not_end_with_ending_puncs = not __is_end_with_ending_puncs(curr_line_text) + + is_not_only_no_meaning_symbols = not __contains_only_no_meaning_symbols(curr_line_text) + + is_equation = __is_equation(curr_line_text) + + is_title_by_len = __is_title_by_len(curr_line_text) + + """ + Decide if the line is a title. 
+ """ + + is_title = ( + is_not_end_with_ending_puncs # not end with ending punctuation marks + and is_not_only_no_meaning_symbols # not only have no meaning symbols + and is_title_by_len # is a title by length, default max length is 200 + and not is_equation # an interline equation should never be a title + and is_potential_title_font # is a potential title font, which is bold or larger than the document average font size or not the same font type as the document average font type + and ( + (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + or (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + or ( + is_much_larger_font_than_doc_avg + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_font_size_little_less_than_doc_avg + and is_bold_font + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + ) # Consider the following situations: bold font, much larger font than doc avg, not same font type as doc avg, sufficient spacing above and below + and ( + ( + not is_person_or_org_list_line_by_nlp + and ( + is_much_larger_font_than_doc_avg + or (is_not_same_font_type_of_docAvg and is_font_size_not_less_than_doc_avg) + ) + ) + or ( + not (is_word_list_line_by_rules and is_person_or_org_list_line_by_nlp) + and not is_a_left_inline_title + and not is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) + or ( + is_person_or_org_list_line_by_nlp + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + and (is_bold_font and is_much_larger_font_than_doc_avg and is_not_same_font_type_of_docAvg) + ) + or (is_numbered_title and not is_a_left_inline_title) + ) # Exclude the following situations: person/org list + ) + # ) or (prev_line_is_title and is_consis_sub_title) + + 
is_name_or_org_list_to_be_removed = ( + (is_person_or_org_list_line_by_nlp) + and is_punctuation_heavy + and (is_title_by_check_prev_line or is_title_by_check_next_line or is_title_by_check_pre_and_next_line) + ) and not is_title + + if is_name_or_org_list_to_be_removed: + is_author_or_org_list = True + else: + is_author_or_org_list = False + + # return is_title, is_author_or_org_list + + """ + # print reason why the line is a title + if is_title: + print_green("This line is a title.") + print_green("↓" * 10) + print() + print("curr_line_text: ", curr_line_text) + print() + + # print reason why the line is not a title + line_text = curr_line_text.strip() + test_text = "Career/Personal Life" + text_content_condition = line_text == test_text + + if not is_title and text_content_condition: # Print specific line + # if not is_title: # Print each line + print_red("This line is not a title.") + print_red("↓" * 10) + + print() + print("curr_line_text: ", curr_line_text) + print() + + if is_not_end_with_ending_puncs: + print_green(f"is_not_end_with_ending_puncs") + else: + print_red(f"is_end_with_ending_puncs") + + if is_not_only_no_meaning_symbols: + print_green(f"is_not_only_no_meaning_symbols") + else: + print_red(f"is_only_no_meaning_symbols") + + if is_title_by_len: + print_green(f"is_title_by_len: {is_title_by_len}") + else: + print_red(f"is_not_title_by_len: {is_title_by_len}") + + if is_equation: + print_red(f"is_equation") + else: + print_green(f"is_not_equation") + + if is_potential_title_font: + print_green(f"is_potential_title_font") + else: + print_red(f"is_not_potential_title_font") + + if is_punctuation_heavy: + print_red("is_punctuation_heavy") + else: + print_green("is_not_punctuation_heavy") + + if is_bold_font: + print_green(f"is_bold_font") + else: + print_red(f"is_not_bold_font") + + if is_font_size_not_less_than_doc_avg: + print_green(f"is_larger_font_than_doc_avg") + else: + print_red(f"is_not_larger_font_than_doc_avg") + + if 
is_much_larger_font_than_doc_avg: + print_green(f"is_much_larger_font_than_doc_avg") + else: + print_red(f"is_not_much_larger_font_than_doc_avg") + + if is_not_same_font_type_of_docAvg: + print_green(f"is_not_same_font_type_of_docAvg") + else: + print_red(f"is_same_font_type_of_docAvg") + + if is_word_list_line_by_rules: + print_red("is_word_list_line_by_rules") + else: + print_green("is_not_name_list_by_rules") + + if is_person_or_org_list_line_by_nlp: + print_red("is_person_or_org_list_line_by_nlp") + else: + print_green("is_not_person_or_org_list_line_by_nlp") + + if not is_numbered_title: + print_red("is_not_numbered_title") + else: + print_green("is_numbered_title") + + if is_a_left_inline_title: + print_red("is_a_left_inline_title") + else: + print_green("is_not_a_left_inline_title") + + if not is_title_by_check_prev_line: + print_red("is_not_title_by_check_prev_line") + else: + print_green("is_title_by_check_prev_line") + + if not is_title_by_check_next_line: + print_red("is_not_title_by_check_next_line") + else: + print_green("is_title_by_check_next_line") + + if not is_title_by_check_pre_and_next_line: + print_red("is_not_title_by_check_pre_and_next_line") + else: + print_green("is_title_by_check_pre_and_next_line") + + # print_green("Common features:") + # print_green("↓" * 10) + + # print(f" curr_line_font_type: {curr_line_font_type}") + # print(f" curr_line_font_size: {curr_line_font_size}") + # print() + + """ + + return is_title, is_author_or_org_list + + def _detect_title(self, input_block): + """ + Use the functions 'is_potential_title' to detect titles of each paragraph block. + If a line is a title, then the value of key 'is_title' of the line will be set to True. 
+ """ + + raw_lines = input_block["lines"] + + prev_line_is_title_flag = False + + for i, curr_line in enumerate(raw_lines): + prev_line = raw_lines[i - 1] if i > 0 else None + next_line = raw_lines[i + 1] if i < len(raw_lines) - 1 else None + + blk_avg_char_width = input_block["avg_char_width"] + blk_avg_char_height = input_block["avg_char_height"] + blk_media_font_size = input_block["median_font_size"] + + is_title, is_author_or_org_list = self._is_potential_title( + curr_line, + prev_line, + prev_line_is_title_flag, + next_line, + blk_avg_char_width, + blk_avg_char_height, + blk_media_font_size, + ) + + if is_title: + curr_line["is_title"] = is_title + prev_line_is_title_flag = True + else: + curr_line["is_title"] = False + prev_line_is_title_flag = False + + # print(f"curr_line['text']: {curr_line['text']}") + # print(f"curr_line['is_title']: {curr_line['is_title']}") + # print(f"prev_line['text']: {prev_line['text'] if prev_line else None}") + # print(f"prev_line_is_title_flag: {prev_line_is_title_flag}") + # print() + + if is_author_or_org_list: + curr_line["is_author_or_org_list"] = is_author_or_org_list + else: + curr_line["is_author_or_org_list"] = False + + return input_block + + def batch_detect_titles(self, pdf_dic): + """ + This function batch process the blocks to detect titles. + + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + num_titles = 0 + + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in blocks.keys(): + para_blocks = blocks["para_blocks"] + + all_single_line_blocks = [] + for block in para_blocks: + if len(block["lines"]) == 1: + all_single_line_blocks.append(block) + + new_para_blocks = [] + if not len(all_single_line_blocks) == len(para_blocks): # Not all blocks are single line blocks. 
+ for para_block in para_blocks: + new_block = self._detect_title(para_block) + new_para_blocks.append(new_block) + num_titles += sum([line.get("is_title", 0) for line in new_block["lines"]]) + else: # All blocks are single line blocks. + for para_block in para_blocks: + new_para_blocks.append(para_block) + num_titles += sum([line.get("is_title", 0) for line in para_block["lines"]]) + para_blocks = new_para_blocks + + blocks["para_blocks"] = para_blocks + + for para_block in para_blocks: + all_titles = all(safe_get(line, "is_title", False) for line in para_block["lines"]) + para_text_len = sum([len(line["text"]) for line in para_block["lines"]]) + if ( + all_titles and para_text_len < 200 + ): # total length of the paragraph is less than 200, more than this should not be a title + para_block["is_block_title"] = 1 + else: + para_block["is_block_title"] = 0 + + all_name_or_org_list_to_be_removed = all( + safe_get(line, "is_author_or_org_list", False) for line in para_block["lines"] + ) + if all_name_or_org_list_to_be_removed and page_id == "page_0": + para_block["is_block_an_author_or_org_list"] = 1 + else: + para_block["is_block_an_author_or_org_list"] = 0 + + pdf_dic["statistics"]["num_titles"] = num_titles + + return pdf_dic + + def _recog_title_level(self, title_blocks): + """ + This function determines the title level based on the font size of the title. 
+ + Parameters + ---------- + title_blocks : list + + Returns + ------- + title_blocks : list + """ + + font_sizes = np.array([safe_get(tb["block"], "block_font_size", 0) for tb in title_blocks]) + + # Use the mean and std of font sizes to remove extreme values + mean_font_size = np.mean(font_sizes) + std_font_size = np.std(font_sizes) + min_extreme_font_size = mean_font_size - std_font_size # type: ignore + max_extreme_font_size = mean_font_size + std_font_size # type: ignore + + # Compute the threshold for title level + middle_font_sizes = font_sizes[(font_sizes > min_extreme_font_size) & (font_sizes < max_extreme_font_size)] + if middle_font_sizes.size > 0: + middle_mean_font_size = np.mean(middle_font_sizes) + level_threshold = middle_mean_font_size + else: + level_threshold = mean_font_size + + for tb in title_blocks: + title_block = tb["block"] + title_font_size = safe_get(title_block, "block_font_size", 0) + + current_level = 1 # Initialize title level, the biggest level is 1 + + # print(f"Before adjustment by font size, {current_level}") + if title_font_size >= max_extreme_font_size: + current_level = 1 + elif title_font_size <= min_extreme_font_size: + current_level = 3 + elif float(title_font_size) >= float(level_threshold): + current_level = 2 + else: + current_level = 3 + # print(f"After adjustment by font size, {current_level}") + + title_block["block_title_level"] = current_level + + return title_blocks + + def batch_recog_title_level(self, pdf_dic): + """ + This function batch process the blocks to recognize title level. 
+ + Parameters + ---------- + pdf_dict : dict + result dictionary + + Returns + ------- + pdf_dict : dict + result dictionary + """ + title_blocks = [] + + # Collect all titles + for page_id, blocks in pdf_dic.items(): + if page_id.startswith("page_"): + para_blocks = blocks.get("para_blocks", []) + for block in para_blocks: + if block.get("is_block_title"): + title_obj = {"page_id": page_id, "block": block} + title_blocks.append(title_obj) + + # Determine title level + if title_blocks: + # Determine title level based on font size + title_blocks = self._recog_title_level(title_blocks) + + return pdf_dic + + +class BlockTerminationProcessor: + """ + This class is used to process the block termination. + """ + + def __init__(self) -> None: + pass + + def _is_consistent_lines( + self, + curr_line, + prev_line, + next_line, + consistent_direction, # 0 for prev, 1 for next, 2 for both + ): + """ + This function checks if the line is consistent with its neighbors + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + consistent_direction : int + 0 for prev, 1 for next, 2 for both + + Returns + ------- + bool + True if the line is consistent with its neighbors, False otherwise. 
+ """ + + curr_line_font_size = curr_line["spans"][0]["size"] + curr_line_font_type = curr_line["spans"][0]["font"].lower() + + if consistent_direction == 0: + if prev_line: + prev_line_font_size = prev_line["spans"][0]["size"] + prev_line_font_type = prev_line["spans"][0]["font"].lower() + return curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type + else: + return False + + elif consistent_direction == 1: + if next_line: + next_line_font_size = next_line["spans"][0]["size"] + next_line_font_type = next_line["spans"][0]["font"].lower() + return curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type + else: + return False + + elif consistent_direction == 2: + if prev_line and next_line: + prev_line_font_size = prev_line["spans"][0]["size"] + prev_line_font_type = prev_line["spans"][0]["font"].lower() + next_line_font_size = next_line["spans"][0]["size"] + next_line_font_type = next_line["spans"][0]["font"].lower() + return (curr_line_font_size == prev_line_font_size and curr_line_font_type == prev_line_font_type) and ( + curr_line_font_size == next_line_font_size and curr_line_font_type == next_line_font_type + ) + else: + return False + + else: + return False + + def _is_regular_line(self, curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_line_height): + """ + This function checks if the line is a regular line + + Parameters + ---------- + curr_line_bbox : list + bbox of the current line + prev_line_bbox : list + bbox of the previous line + next_line_bbox : list + bbox of the next line + avg_char_width : float + average of char widths + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_line_height : float + average of line heights + + Returns + ------- + bool + True if the line is a regular line, False otherwise. 
+ """ + horizontal_ratio = 0.5 + vertical_ratio = 0.5 + horizontal_thres = horizontal_ratio * avg_char_width + vertical_thres = vertical_ratio * avg_line_height + + x0, y0, x1, y1 = curr_line_bbox + + x0_near_X0 = abs(x0 - X0) < horizontal_thres + x1_near_X1 = abs(x1 - X1) < horizontal_thres + + prev_line_is_end_of_para = prev_line_bbox and (abs(prev_line_bbox[2] - X1) > avg_char_width) + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = y1 - prev_line_bbox[3] + sufficient_spacing_above = vertical_spacing_above > vertical_thres + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = next_line_bbox[1] - y0 + sufficient_spacing_below = vertical_spacing_below > vertical_thres + + return ( + (sufficient_spacing_above or sufficient_spacing_below) + or (not x0_near_X0 and not x1_near_X1) + or prev_line_is_end_of_para + ) + + def _is_possible_start_of_para(self, curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size): + """ + This function checks if the line is a possible start of a paragraph + + Parameters + ---------- + curr_line : dict + current line + prev_line : dict + previous line + next_line : dict + next line + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_char_width : float + average of char widths + avg_line_height : float + average of line heights + + Returns + ------- + bool + True if the line is a possible start of a paragraph, False otherwise. 
+ """ + start_confidence = 0.5 # Initial confidence of the line being a start of a paragraph + decision_path = [] # Record the decision path + + curr_line_bbox = curr_line["bbox"] + prev_line_bbox = prev_line["bbox"] if prev_line else None + next_line_bbox = next_line["bbox"] if next_line else None + + indent_ratio = 1 + + vertical_ratio = 1.5 + vertical_thres = vertical_ratio * avg_font_size + + left_horizontal_ratio = 0.5 + left_horizontal_thres = left_horizontal_ratio * avg_char_width + + right_horizontal_ratio = 2.5 + right_horizontal_thres = right_horizontal_ratio * avg_char_width + + x0, y0, x1, y1 = curr_line_bbox + + indent_condition = x0 > X0 + indent_ratio * avg_char_width + if indent_condition: + start_confidence += 0.2 + decision_path.append("indent_condition_met") + + x0_near_X0 = abs(x0 - X0) < left_horizontal_thres + if x0_near_X0: + start_confidence += 0.1 + decision_path.append("x0_near_X0") + + x1_near_X1 = abs(x1 - X1) < right_horizontal_thres + if x1_near_X1: + start_confidence += 0.1 + decision_path.append("x1_near_X1") + + if prev_line is None: + prev_line_is_end_of_para = True + start_confidence += 0.2 + decision_path.append("no_prev_line") + else: + prev_line_is_end_of_para, _, _ = self._is_possible_end_of_para(prev_line, next_line, X0, X1, avg_char_width) + if prev_line_is_end_of_para: + start_confidence += 0.1 + decision_path.append("prev_line_is_end_of_para") + + sufficient_spacing_above = False + if prev_line_bbox: + vertical_spacing_above = y1 - prev_line_bbox[3] + sufficient_spacing_above = vertical_spacing_above > vertical_thres + if sufficient_spacing_above: + start_confidence += 0.2 + decision_path.append("sufficient_spacing_above") + + sufficient_spacing_below = False + if next_line_bbox: + vertical_spacing_below = next_line_bbox[1] - y0 + sufficient_spacing_below = vertical_spacing_below > vertical_thres + if sufficient_spacing_below: + start_confidence += 0.2 + decision_path.append("sufficient_spacing_below") + + is_regular_line 
= self._is_regular_line( + curr_line_bbox, prev_line_bbox, next_line_bbox, avg_char_width, X0, X1, avg_font_size + ) + if is_regular_line: + start_confidence += 0.1 + decision_path.append("is_regular_line") + + is_start_of_para = ( + (sufficient_spacing_above or sufficient_spacing_below) + or (indent_condition) + or (not indent_condition and x0_near_X0 and x1_near_X1 and not is_regular_line) + or prev_line_is_end_of_para + ) + return (is_start_of_para, start_confidence, decision_path) + + def _is_possible_end_of_para(self, curr_line, next_line, X0, X1, avg_char_width): + """ + This function checks if the line is a possible end of a paragraph + + Parameters + ---------- + curr_line : dict + current line + next_line : dict + next line + X0 : float + median of x0 values, which represents the left average boundary of the page + X1 : float + median of x1 values, which represents the right average boundary of the page + avg_char_width : float + average of char widths + + Returns + ------- + bool + True if the line is a possible end of a paragraph, False otherwise. 
+ """ + + end_confidence = 0.5 # Initial confidence of the line being a end of a paragraph + decision_path = [] # Record the decision path + + curr_line_bbox = curr_line["bbox"] + next_line_bbox = next_line["bbox"] if next_line else None + + left_horizontal_ratio = 0.5 + right_horizontal_ratio = 0.5 + + x0, _, x1, y1 = curr_line_bbox + next_x0, next_y0, _, _ = next_line_bbox if next_line_bbox else (0, 0, 0, 0) + + x0_near_X0 = abs(x0 - X0) < left_horizontal_ratio * avg_char_width + if x0_near_X0: + end_confidence += 0.1 + decision_path.append("x0_near_X0") + + x1_smaller_than_X1 = x1 < X1 - right_horizontal_ratio * avg_char_width + if x1_smaller_than_X1: + end_confidence += 0.1 + decision_path.append("x1_smaller_than_X1") + + next_line_is_start_of_para = ( + next_line_bbox + and (next_x0 > X0 + left_horizontal_ratio * avg_char_width) + and (not is_line_left_aligned_from_neighbors(curr_line_bbox, None, next_line_bbox, avg_char_width, direction=1)) + ) + if next_line_is_start_of_para: + end_confidence += 0.2 + decision_path.append("next_line_is_start_of_para") + + is_line_left_aligned_from_neighbors_bool = is_line_left_aligned_from_neighbors( + curr_line_bbox, None, next_line_bbox, avg_char_width + ) + if is_line_left_aligned_from_neighbors_bool: + end_confidence += 0.1 + decision_path.append("line_is_left_aligned_from_neighbors") + + is_line_right_aligned_from_neighbors_bool = is_line_right_aligned_from_neighbors( + curr_line_bbox, None, next_line_bbox, avg_char_width + ) + if not is_line_right_aligned_from_neighbors_bool: + end_confidence += 0.1 + decision_path.append("line_is_not_right_aligned_from_neighbors") + + is_end_of_para = end_with_punctuation(curr_line["text"]) and ( + (x0_near_X0 and x1_smaller_than_X1) + or (is_line_left_aligned_from_neighbors_bool and not is_line_right_aligned_from_neighbors_bool) + ) + + return (is_end_of_para, end_confidence, decision_path) + + def _cut_paras_per_block( + self, + block, + ): + """ + Processes a raw block from PyMuPDF 
and returns the processed block. + + Parameters + ---------- + raw_block : dict + A raw block from pymupdf. + + Returns + ------- + processed_block : dict + + """ + + def _construct_para(lines, is_block_title, para_title_level): + """ + Construct a paragraph from given lines. + """ + + font_sizes = [span["size"] for line in lines for span in line["spans"]] + avg_font_size = sum(font_sizes) / len(font_sizes) if font_sizes else 0 + + font_colors = [span["color"] for line in lines for span in line["spans"]] + most_common_font_color = max(set(font_colors), key=font_colors.count) if font_colors else None + + font_type_lengths = {} + for line in lines: + for span in line["spans"]: + font_type = span["font"] + bbox_width = span["bbox"][2] - span["bbox"][0] + if font_type in font_type_lengths: + font_type_lengths[font_type] += bbox_width + else: + font_type_lengths[font_type] = bbox_width + + # get the font type with the longest bbox width + most_common_font_type = max(font_type_lengths, key=font_type_lengths.get) if font_type_lengths else None # type: ignore + + para_bbox = calculate_para_bbox(lines) + para_text = " ".join(line["text"] for line in lines) + + return { + "para_bbox": para_bbox, + "para_text": para_text, + "para_font_type": most_common_font_type, + "para_font_size": avg_font_size, + "para_font_color": most_common_font_color, + "is_para_title": is_block_title, + "para_title_level": para_title_level, + } + + block_bbox = block["bbox"] + block_text = block["text"] + block_lines = block["lines"] + + X0 = safe_get(block, "X0", 0) + X1 = safe_get(block, "X1", 0) + avg_char_width = safe_get(block, "avg_char_width", 0) + avg_char_height = safe_get(block, "avg_char_height", 0) + avg_font_size = safe_get(block, "avg_font_size", 0) + + is_block_title = safe_get(block, "is_block_title", False) + para_title_level = safe_get(block, "block_title_level", 0) + + # Segment into paragraphs + para_ranges = [] + in_paragraph = False + start_idx_of_para = None + + # Create the 
processed paragraphs + processed_paras = {} + para_bboxes = [] + end_idx_of_para = 0 + + for line_index, line in enumerate(block_lines): + curr_line = line + prev_line = block_lines[line_index - 1] if line_index > 0 else None + next_line = block_lines[line_index + 1] if line_index < len(block_lines) - 1 else None + + """ + Start processing paragraphs. + """ + + # Check if the line is the start of a paragraph + is_start_of_para, start_confidence, decision_path = self._is_possible_start_of_para( + curr_line, prev_line, next_line, X0, X1, avg_char_width, avg_font_size + ) + if not in_paragraph and is_start_of_para: + in_paragraph = True + start_idx_of_para = line_index + + # print_green(">>> Start of a paragraph") + # print(" curr_line_text: ", curr_line["text"]) + # print(" start_confidence: ", start_confidence) + # print(" decision_path: ", decision_path) + + # Check if the line is the end of a paragraph + is_end_of_para, end_confidence, decision_path = self._is_possible_end_of_para( + curr_line, next_line, X0, X1, avg_char_width + ) + if in_paragraph and (is_end_of_para or not next_line): + para_ranges.append((start_idx_of_para, line_index)) + start_idx_of_para = None + in_paragraph = False + + # print_red(">>> End of a paragraph") + # print(" curr_line_text: ", curr_line["text"]) + # print(" end_confidence: ", end_confidence) + # print(" decision_path: ", decision_path) + + # Add the last paragraph if it is not added + if in_paragraph and start_idx_of_para is not None: + para_ranges.append((start_idx_of_para, len(block_lines) - 1)) + + # Process the matched paragraphs + for para_index, (start_idx, end_idx) in enumerate(para_ranges): + matched_lines = block_lines[start_idx : end_idx + 1] + para_properties = _construct_para(matched_lines, is_block_title, para_title_level) + para_key = f"para_{len(processed_paras)}" + processed_paras[para_key] = para_properties + para_bboxes.append(para_properties["para_bbox"]) + end_idx_of_para = end_idx + 1 + + # Deal with the 
remaining lines + if end_idx_of_para < len(block_lines): + unmatched_lines = block_lines[end_idx_of_para:] + unmatched_properties = _construct_para(unmatched_lines, is_block_title, para_title_level) + unmatched_key = f"para_{len(processed_paras)}" + processed_paras[unmatched_key] = unmatched_properties + para_bboxes.append(unmatched_properties["para_bbox"]) + + block["paras"] = processed_paras + + return block + + def batch_process_blocks(self, pdf_dict): + """ + Parses the blocks of all pages. + + Parameters + ---------- + pdf_dict : dict + PDF dictionary. + filter_blocks : list + List of bounding boxes to filter. + + Returns + ------- + result_dict : dict + Result dictionary. + + """ + + num_paras = 0 + + for page_id, page in pdf_dict.items(): + if page_id.startswith("page_"): + para_blocks = [] + if "para_blocks" in page.keys(): + input_blocks = page["para_blocks"] + for input_block in input_blocks: + new_block = self._cut_paras_per_block(input_block) + para_blocks.append(new_block) + num_paras += len(new_block["paras"]) + + page["para_blocks"] = para_blocks + + pdf_dict["statistics"]["num_paras"] = num_paras + return pdf_dict + + +class BlockContinuationProcessor: + """ + This class is used to process the blocks to detect block continuations. + """ + + def __init__(self) -> None: + pass + + def __is_similar_font_type(self, font_type_1, font_type_2, prefix_length_ratio=0.3): + """ + This function checks if the two font types are similar. + Definition of similar font types: the two font types have a common prefix, + and the length of the common prefix is at least a certain ratio of the length of the shorter font type. + + Parameters + ---------- + font_type1 : str + font type 1 + font_type2 : str + font type 2 + prefix_length_ratio : float + minimum ratio of the common prefix length to the length of the shorter font type + + Returns + ------- + bool + True if the two font types are similar, False otherwise. 
+ """ + + if isinstance(font_type_1, list): + font_type_1 = font_type_1[0] if font_type_1 else "" + if isinstance(font_type_2, list): + font_type_2 = font_type_2[0] if font_type_2 else "" + + if font_type_1 == font_type_2: + return True + + # Find the length of the common prefix + common_prefix_length = len(os.path.commonprefix([font_type_1, font_type_2])) + + # Calculate the minimum prefix length based on the ratio + min_prefix_length = int(min(len(font_type_1), len(font_type_2)) * prefix_length_ratio) + + return common_prefix_length >= min_prefix_length + + def __is_same_block_font(self, block_1, block_2): + """ + This function compares the font of block1 and block2 + + Parameters + ---------- + block1 : dict + block1 + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 have the same font, else False + """ + block_1_font_type = safe_get(block_1, "block_font_type", "") + block_1_font_size = safe_get(block_1, "block_font_size", 0) + block_1_avg_char_width = safe_get(block_1, "avg_char_width", 0) + + block_2_font_type = safe_get(block_2, "block_font_type", "") + block_2_font_size = safe_get(block_2, "block_font_size", 0) + block_2_avg_char_width = safe_get(block_2, "avg_char_width", 0) + + if isinstance(block_1_font_size, list): + block_1_font_size = block_1_font_size[0] if block_1_font_size else 0 + if isinstance(block_2_font_size, list): + block_2_font_size = block_2_font_size[0] if block_2_font_size else 0 + + block_1_text = safe_get(block_1, "text", "") + block_2_text = safe_get(block_2, "text", "") + + if block_1_avg_char_width == 0 or block_2_avg_char_width == 0: + return False + + if not block_1_text or not block_2_text: + return False + else: + text_len_ratio = len(block_2_text) / len(block_1_text) + if text_len_ratio < 0.2: + avg_char_width_condition = ( + abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.5 + ) + else: + avg_char_width_condition = ( + 
abs(block_1_avg_char_width - block_2_avg_char_width) / min(block_1_avg_char_width, block_2_avg_char_width) + < 0.2 + ) + + block_font_size_condition = abs(block_1_font_size - block_2_font_size) < 1 + + return ( + self.__is_similar_font_type(block_1_font_type, block_2_font_type) + and avg_char_width_condition + and block_font_size_condition + ) + + def _is_alphabet_char(self, char): + if (char >= "\u0041" and char <= "\u005a") or (char >= "\u0061" and char <= "\u007a"): + return True + else: + return False + + def _is_chinese_char(self, char): + if char >= "\u4e00" and char <= "\u9fa5": + return True + else: + return False + + def _is_other_letter_char(self, char): + try: + cat = unicodedata.category(char) + if cat == "Lu" or cat == "Ll": + return not self._is_alphabet_char(char) and not self._is_chinese_char(char) + except TypeError: + print("The input to the function must be a single character.") + return False + + def _is_year(self, s: str): + try: + number = int(s) + return 1900 <= number <= 2099 + except ValueError: + return False + + def _match_brackets(self, text): + # pattern = r"^[\(\)\[\]()【】{}{}<><>〔〕〘〙\"\'“”‘’]" + pattern = r"^[\(\)\]()】{}{}>>〕〙\"\'“”‘’]" + return bool(re.match(pattern, text)) + + def _is_para_font_consistent(self, para_1, para_2): + """ + This function compares the font of para1 and para2 + + Parameters + ---------- + para1 : dict + para1 + para2 : dict + para2 + + Returns + ------- + is_same : bool + True if para1 and para2 have the same font, else False + """ + if para_1 is None or para_2 is None: + return False + + para_1_font_type = safe_get(para_1, "para_font_type", "") + para_1_font_size = safe_get(para_1, "para_font_size", 0) + para_1_font_color = safe_get(para_1, "para_font_color", "") + + para_2_font_type = safe_get(para_2, "para_font_type", "") + para_2_font_size = safe_get(para_2, "para_font_size", 0) + para_2_font_color = safe_get(para_2, "para_font_color", "") + + if isinstance(para_1_font_type, list): # get the most common 
    def _is_para_puncs_consistent(self, para_1, para_2):
        """
        This function determines whether para1 and para2 are originally from the same paragraph by checking the puncs of para1(former) and para2(latter)

        Parameters
        ----------
        para1 : dict
            para1
        para2 : dict
            para2

        Returns
        -------
        is_same : bool
            True if para1 and para2 are from the same paragraph by using the puncs, else False
        """
        para_1_text = safe_get(para_1, "para_text", "").strip()
        para_2_text = safe_get(para_2, "para_text", "").strip()

        para_1_bboxes = safe_get(para_1, "para_bbox", [])
        para_1_font_sizes = safe_get(para_1, "para_font_size", 0)

        para_2_bboxes = safe_get(para_2, "para_bbox", [])
        para_2_font_sizes = safe_get(para_2, "para_font_size", 0)

        # For multi-bbox paragraphs, compare para_1's LAST line against
        # para_2's FIRST line (the two lines that would meet at the join).
        if is_nested_list(para_1_bboxes):
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes[-1]
        else:
            x0_1, y0_1, x1_1, y1_1 = para_1_bboxes

        if is_nested_list(para_2_bboxes):
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes[0]
            para_2_font_sizes = para_2_font_sizes[0]  # type: ignore
        else:
            x0_2, y0_2, x1_2, y1_2 = para_2_bboxes

        # Alignment tolerances scale with the mean of the two font sizes
        # (0.5 * (s1 + s2) is the mean; the extra 0.8 tightens it).
        right_align_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        are_two_paras_right_aligned = abs(x1_1 - x1_2) < right_align_threshold

        left_indent_threshold = 0.5 * (para_1_font_sizes + para_2_font_sizes) * 0.8
        is_para1_left_indent_than_papa2 = x0_1 - x0_2 > left_indent_threshold
        is_para2_left_indent_than_papa1 = x0_2 - x0_1 > left_indent_threshold

        # Check if either para_text1 or para_text2 is empty
        if not para_1_text or not para_2_text:
            return False

        # Define the end puncs for a sentence to end and hyphen
        end_puncs = [".", "?", "!", "。", "?", "!", "…"]
        hyphen = ["-", "—"]

        # Check if para_text1 ends with either hyphen or non-end punctuation or spaces
        para_1_end_with_hyphen = para_1_text and para_1_text[-1] in hyphen
        para_1_end_with_end_punc = para_1_text and para_1_text[-1] in end_puncs
        para_1_end_with_space = para_1_text and para_1_text[-1] == " "
        para_1_not_end_with_end_punc = para_1_text and para_1_text[-1] not in end_puncs

        # NOTE(review): the `para_1_end_with_space` branch below is dead code:
        # para_1_text is .strip()ped so it cannot end with " ", and even if it
        # could, the `para_1_not_end_with_end_punc` branch above it catches
        # every text not ending in an end punc first.

        if para_1_end_with_hyphen:  # If para_text1 ends with hyphen
            # Hyphenated line break: para_2 must continue with a hyphen or a
            # lowercase/CJK/other letter.
            para_2_is_consistent = para_2_text and (
                para_2_text[0] in hyphen
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_end_with_end_punc:  # If para_text1 ends with ending punctuations
            # Sentence already closed: only merge if para_2 starts with a
            # space and is not left-indented relative to para_1.
            para_2_is_consistent = (
                para_2_text
                and (
                    para_2_text[0]
                    == " "
                    # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper())
                    # or (self._is_chinese_char(para_2_text[0]))
                    # or (self._is_other_letter_char(para_2_text[0]))
                )
                and not is_para2_left_indent_than_papa1
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_not_end_with_end_punc:  # If para_text1 is not end with ending punctuations
            # Sentence still open: merge on any plausible continuation —
            # letters, a year, brackets/quotes, or matching alignment.
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_alphabet_char(para_2_text[0]))
                or (self._is_year(para_2_text[0:4]))
                or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2)
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
                or (self._match_brackets(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        elif para_1_end_with_space:  # If para_text1 ends with space
            para_2_is_consistent = para_2_text and (
                para_2_text[0] == " "
                or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower())
                or (self._is_chinese_char(para_2_text[0]))
                or (self._is_other_letter_char(para_2_text[0]))
            )
            if para_2_is_consistent:
                return True
            else:
                pass

        return False
para_2_is_consistent = ( + para_2_text + and ( + para_2_text[0] + == " " + # or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].isupper()) + # or (self._is_chinese_char(para_2_text[0])) + # or (self._is_other_letter_char(para_2_text[0])) + ) + and not is_para2_left_indent_than_papa1 + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + # print(f"para_2 is not consistent.\n") + pass + + elif para_1_not_end_with_end_punc: # If para_text1 is not end with ending punctuations + # print_red(f"para_1 is NOT end with end_punc.") + para_2_is_consistent = para_2_text and ( + para_2_text[0] == " " + or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower()) + or (self._is_alphabet_char(para_2_text[0])) + or (self._is_year(para_2_text[0:4])) + or (are_two_paras_right_aligned or is_para1_left_indent_than_papa2) + or (self._is_chinese_char(para_2_text[0])) + or (self._is_other_letter_char(para_2_text[0])) + or (self._match_brackets(para_2_text[0])) + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + # print(f"para_2 is not consistent.\n") + pass + + elif para_1_end_with_space: # If para_text1 ends with space + # print_red(f"para_1 is end with space.") + para_2_is_consistent = para_2_text and ( + para_2_text[0] == " " + or (self._is_alphabet_char(para_2_text[0]) and para_2_text[0].islower()) + or (self._is_chinese_char(para_2_text[0])) + or (self._is_other_letter_char(para_2_text[0])) + ) + if para_2_is_consistent: + # print(f"para_2 is consistent.\n") + return True + else: + pass + # print(f"para_2 is not consistent.\n") + + return False + + def _is_block_consistent(self, block_1, block_2): + """ + This function determines whether block1 and block2 are originally from the same block + + Parameters + ---------- + block1 : dict + block1s + block2 : dict + block2 + + Returns + ------- + is_same : bool + True if block1 and block2 are from the same block, else False + """ + return 
def _is_para_continued(self, para_1, para_2):
    """
    Decide whether para_1 and para_2 originally belong to the same paragraph:
    both the font comparison and the punctuation/alignment comparison must agree.

    Parameters
    ----------
    para_1 : dict
        Former paragraph.
    para_2 : dict
        Latter paragraph.

    Returns
    -------
    bool
        True if the paragraphs continue each other, else False.
    """
    return self._is_para_font_consistent(para_1, para_2) and self._is_para_puncs_consistent(para_1, para_2)

def _are_boundaries_of_block_consistent(self, block_1, block_2):
    """
    Check whether the last line of block_1 and the first line of block_2 look
    typographically consistent (similar font type, size within 1pt, same flags).

    Parameters
    ----------
    block_1 : dict
        Former block (its last line is inspected).
    block_2 : dict
        Latter block (its first line is inspected).

    Returns
    -------
    bool
        True if the facing boundary lines are consistent, else False.
    """
    last_line_of_block_1 = block_1["lines"][-1]
    first_line_of_block_2 = block_2["lines"][0]

    # Only the first span of each boundary line is sampled.
    span_1 = last_line_of_block_1["spans"][0]
    span_2 = first_line_of_block_2["spans"][0]

    # Font color is intentionally NOT compared (it was disabled in the
    # original implementation as well).
    return (
        self.__is_similar_font_type(span_1["font"].lower(), span_2["font"].lower())
        and abs(span_1["size"] - span_2["size"]) < 1
        and span_1["flags"] == span_2["flags"]
    )

def should_merge_next_para(self, curr_para, next_para):
    """
    Check whether *next_para* should be merged into *curr_para*.

    Parameters
    ----------
    curr_para : dict
        The current paragraph.
    next_para : dict
        The next paragraph.

    Returns
    -------
    bool
        True if the next paragraph should be merged into the current one.
    """
    # Simplified from an `if …: return True else: return False` wrapper.
    return self._is_para_continued(curr_para, next_para)

def batch_tag_paras(self, pdf_dict):
    """
    Tag every paragraph in *pdf_dict* with its own location and, where a
    continuation is detected, the location of the paragraph to merge with.

    For each paragraph this sets:
    - "curr_para_location": [page_idx, block_id, para_idx]
    - "next_para_location": location of the continuation paragraph, or None
    - "merge_next_para": True when the following paragraph continues this one

    Parameters
    ----------
    pdf_dict : dict
        PDF dictionary keyed by "page_N".

    Returns
    -------
    dict
        The same dictionary with tagged paragraphs.
    """
    # NOTE(review): this assumes pdf_dict holds only page entries; a
    # non-page key (e.g. "statistics") would make this index off by one.
    the_last_page_id = len(pdf_dict) - 1

    for curr_page_idx, (curr_page_id, curr_page_content) in enumerate(pdf_dict.items()):
        if curr_page_id.startswith("page_") and curr_page_content.get("para_blocks", []):
            para_blocks_of_curr_page = curr_page_content["para_blocks"]
            next_page_idx = curr_page_idx + 1
            next_page_id = f"page_{next_page_idx}"
            next_page_content = pdf_dict.get(next_page_id, {})

            for i, current_block in enumerate(para_blocks_of_curr_page):
                if not current_block["paras"]:
                    # Robustness: a block without paragraphs has nothing to tag
                    # (the original would IndexError on list(...)[-1] below).
                    continue

                for para_id, curr_para in current_block["paras"].items():
                    curr_para["curr_para_location"] = [
                        curr_page_idx,
                        current_block["block_id"],
                        int(para_id.split("_")[-1]),
                    ]
                    curr_para["next_para_location"] = None  # default: no continuation
                    curr_para["merge_next_para"] = False  # default: do not merge

                next_block = para_blocks_of_curr_page[i + 1] if i < len(para_blocks_of_curr_page) - 1 else None

                if next_block:
                    # Compare the last paragraph of this block with the first
                    # paragraph of the next block on the same page.
                    curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                    curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                    next_block_first_para_key = list(next_block["paras"].keys())[0]
                    next_blk_first_para = next_block["paras"][next_block_first_para_key]

                    if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                        curr_blk_last_para["next_para_location"] = [
                            curr_page_idx,
                            next_block["block_id"],
                            int(next_block_first_para_key.split("_")[-1]),
                        ]
                        curr_blk_last_para["merge_next_para"] = True
                else:
                    # Last block on the page: look ahead to the first block of
                    # the next page that actually has para_blocks.
                    curr_block_last_para_key = list(current_block["paras"].keys())[-1]
                    curr_blk_last_para = current_block["paras"][curr_block_last_para_key]

                    while not next_page_content.get("para_blocks", []) and next_page_idx <= the_last_page_id:
                        next_page_idx += 1
                        next_page_id = f"page_{next_page_idx}"
                        next_page_content = pdf_dict.get(next_page_id, {})

                    if next_page_content.get("para_blocks", []):
                        next_blk_first_para_key = list(next_page_content["para_blocks"][0]["paras"].keys())[0]
                        next_blk_first_para = next_page_content["para_blocks"][0]["paras"][next_blk_first_para_key]

                        if self.should_merge_next_para(curr_blk_last_para, next_blk_first_para):
                            curr_blk_last_para["next_para_location"] = [
                                next_page_idx,
                                next_page_content["para_blocks"][0]["block_id"],
                                int(next_blk_first_para_key.split("_")[-1]),
                            ]
                            curr_blk_last_para["merge_next_para"] = True

    return pdf_dict

def find_block_by_id(self, para_blocks, block_id):
    """
    Find a block by its id.

    Parameters
    ----------
    para_blocks : list
        List of block dicts.
    block_id : int
        Id of the block to find.

    Returns
    -------
    dict or None
        The block with the given id, or None when absent.
    """
    # The original enumerated the list but never used the index.
    for block in para_blocks:
        if block.get("block_id") == block_id:
            return block
    return None

def batch_merge_paras(self, pdf_dict):
    """
    Merge the paragraphs tagged by batch_tag_paras: each paragraph whose
    "merge_next_para" flag is set absorbs the text of its continuation chain;
    absorbed paragraphs have their text emptied.

    Parameters
    ----------
    pdf_dict : dict
        PDF dictionary keyed by "page_N".

    Returns
    -------
    dict
        The same dictionary with merged paragraphs.
    """
    for page_id, page_content in pdf_dict.items():
        if page_id.startswith("page_") and page_content.get("para_blocks", []):
            para_blocks_of_page = page_content["para_blocks"]

            for current_block in para_blocks_of_page:
                paras = current_block["paras"]

                # list() because paragraph dicts are mutated during iteration.
                for para_id, curr_para in list(paras.items()):
                    # Title paragraphs never absorb continuations.
                    if curr_para.get("is_para_title"):
                        continue

                    # Follow the continuation chain until it breaks.
                    while curr_para.get("merge_next_para"):
                        curr_para_location = curr_para.get("curr_para_location")
                        next_para_location = curr_para.get("next_para_location")

                        if not next_para_location:
                            break

                        if curr_para_location == next_para_location:
                            # Self-reference would loop forever; clear the flag.
                            curr_para["merge_next_para"] = False
                            break

                        next_page_idx, next_block_id, next_para_id = next_para_location
                        next_page_id = f"page_{next_page_idx}"
                        next_page_content = pdf_dict.get(next_page_id)
                        if not next_page_content:
                            break

                        next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id)
                        if not next_block:
                            break

                        next_para = next_block["paras"].get(f"para_{next_para_id}")
                        if not next_para or next_para.get("is_para_title"):
                            break

                        # Merge the texts with a single separating space.
                        curr_para_text = curr_para.get("para_text", "")
                        next_para_text = next_para.get("para_text", "")
                        curr_para["para_text"] = curr_para_text + " " + next_para_text

                        # Advance the chain and mark the absorbed paragraph empty.
                        curr_para["next_para_location"] = next_para.get("next_para_location")
                        next_para["para_text"] = ""
                        curr_para["merge_next_para"] = next_para.get("merge_next_para", False)

    return pdf_dict
+ """ + for page_id, page_content in pdf_dict.items(): + if page_id.startswith("page_") and page_content.get("para_blocks", []): + para_blocks_of_page = page_content["para_blocks"] + + for i in range(len(para_blocks_of_page)): + current_block = para_blocks_of_page[i] + paras = current_block["paras"] + + for para_id, curr_para in list(paras.items()): + # print(f"current para_id: {para_id}") + # 跳过标题段落 + if curr_para.get("is_para_title"): + continue + + while curr_para.get("merge_next_para"): + curr_para_location = curr_para.get("curr_para_location") + next_para_location = curr_para.get("next_para_location") + + # print(f"curr_para_location: {curr_para_location}, next_para_location: {next_para_location}") + + if not next_para_location: + break + + if curr_para_location == next_para_location: + # print_red("The next para is in the same block as the current para.") + curr_para["merge_next_para"] = False + break + + next_page_idx, next_block_id, next_para_id = next_para_location + next_page_id = f"page_{next_page_idx}" + next_page_content = pdf_dict.get(next_page_id) + if not next_page_content: + break + + next_block = self.find_block_by_id(next_page_content.get("para_blocks", []), next_block_id) + + if not next_block: + break + + next_para = next_block["paras"].get(f"para_{next_para_id}") + + if not next_para or next_para.get("is_para_title"): + break + + # 合并段落文本 + curr_para_text = curr_para.get("para_text", "") + next_para_text = next_para.get("para_text", "") + curr_para["para_text"] = curr_para_text + " " + next_para_text + + # 更新 next_para_location + curr_para["next_para_location"] = next_para.get("next_para_location") + + # 将下一个段落文本置为空,表示已被合并 + next_para["para_text"] = "" + + # 更新 merge_next_para 标记 + curr_para["merge_next_para"] = next_para.get("merge_next_para", False) + + return pdf_dict + + +class DrawAnnos: + """ + This class draws annotations on the pdf file + + ---------------------------------------- + Color Code + ---------------------------------------- 
+ Red: (1, 0, 0) + Green: (0, 1, 0) + Blue: (0, 0, 1) + Yellow: (1, 1, 0) - mix of red and green + Cyan: (0, 1, 1) - mix of green and blue + Magenta: (1, 0, 1) - mix of red and blue + White: (1, 1, 1) - red, green and blue full intensity + Black: (0, 0, 0) - no color component whatsoever + Gray: (0.5, 0.5, 0.5) - equal and medium intensity of red, green and blue color components + Orange: (1, 0.65, 0) - maximum intensity of red, medium intensity of green, no blue component + """ + + def __init__(self) -> None: + pass + + def __is_nested_list(self, lst): + """ + This function returns True if the given list is a nested list of any degree. + """ + if isinstance(lst, list): + return any(self.__is_nested_list(i) for i in lst) or any(isinstance(i, list) for i in lst) + return False + + def __valid_rect(self, bbox): + # Ensure that the rectangle is not empty or invalid + if isinstance(bbox[0], list): + return False # It's a nested list, hence it can't be valid rect + else: + return bbox[0] < bbox[2] and bbox[1] < bbox[3] + + def __draw_nested_boxes(self, page, nested_bbox, color=(0, 1, 1)): + """ + This function draws the nested boxes + + Parameters + ---------- + page : fitz.Page + page + nested_bbox : list + nested bbox + color : tuple + color, by default (0, 1, 1) # draw with cyan color for combined paragraph + """ + if self.__is_nested_list(nested_bbox): # If it's a nested list + for bbox in nested_bbox: + self.__draw_nested_boxes(page, bbox, color) # Recursively call the function + elif self.__valid_rect(nested_bbox): # If valid rectangle + para_rect = fitz.Rect(nested_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=color) # draw with cyan color for combined paragraph + para_anno.set_border(width=1) + para_anno.update() + + def draw_annos(self, input_pdf_path, pdf_dic, output_pdf_path): + """ + This function draws annotations on the pdf file. 
+ + Parameters + ---------- + input_pdf_path : str + path to the input pdf file + pdf_dic : dict + pdf dictionary + output_pdf_path : str + path to the output pdf file + + pdf_dic : dict + pdf dictionary + """ + pdf_doc = open_pdf(input_pdf_path) + + if pdf_dic is None: + pdf_dic = {} + + if output_pdf_path is None: + output_pdf_path = input_pdf_path.replace(".pdf", "_anno.pdf") + + for page_id, page in enumerate(pdf_doc): # type: ignore + page_key = f"page_{page_id}" + for ele_key, ele_data in pdf_dic[page_key].items(): + if ele_key == "para_blocks": + para_blocks = ele_data + for para_block in para_blocks: + if "paras" in para_block.keys(): + paras = para_block["paras"] + for para_key, para_content in paras.items(): + para_bbox = para_content["para_bbox"] + # print(f"para_bbox: {para_bbox}") + # print(f"is a nested list: {self.__is_nested_list(para_bbox)}") + if self.__is_nested_list(para_bbox) and len(para_bbox) > 1: + color = (0, 1, 1) + self.__draw_nested_boxes( + page, para_bbox, color + ) # draw with cyan color for combined paragraph + else: + if self.__valid_rect(para_bbox): + para_rect = fitz.Rect(para_bbox) + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 1, 0)) # draw with green color for normal paragraph + para_anno.set_border(width=0.5) + para_anno.update() + + is_para_title = para_content["is_para_title"] + if is_para_title: + if self.__is_nested_list(para_content["para_bbox"]) and len(para_content["para_bbox"]) > 1: + color = (0, 0, 1) + self.__draw_nested_boxes( + page, para_content["para_bbox"], color + ) # draw with cyan color for combined title + else: + if self.__valid_rect(para_content["para_bbox"]): + para_rect = fitz.Rect(para_content["para_bbox"]) + if self.__valid_rect(para_content["para_bbox"]): + para_anno = page.add_rect_annot(para_rect) + para_anno.set_colors(stroke=(0, 0, 1)) # draw with blue color for normal title + para_anno.set_border(width=0.5) + para_anno.update() + + pdf_doc.save(output_pdf_path) + 
class ParaProcessPipeline:
    """Orchestrates the full paragraph post-processing pipeline for one PDF."""

    def __init__(self) -> None:
        pass

    def para_process_pipeline(self, pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None):
        """
        Process the paragraphs of a parsed PDF, including:
        1. Read raw input json file into pdf_dic
        2. Detect and replace equations
        3. Combine spans into a natural line
        4. Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        5. Compute statistics for each block
        6. Detect titles in the document
        7. Detect paragraphs inside each block
        8. Divide the level of the titles
        9. Detect and combine paragraphs from different blocks into one paragraph
        10. Check whether the final results after checking headings, dividing paragraphs within blocks, and merging paragraphs between blocks are plausible and reasonable.
        11. Draw annotations on the pdf file

        Parameters
        ----------
        pdf_info_dict : dict
            Parsed PDF dictionary. Data noise (overlap blocks, header, footer,
            watermark, vertical margin notes) has already been removed.
        para_debug_mode : str or None
            None disables debug output; "full" dumps intermediate stages.
        input_pdf_path : str or None
            Path to the input pdf file.
        output_pdf_path : str or None
            Path to the output (annotated) pdf file.

        Returns
        -------
        tuple
            (pdf_dict, error_info) — error_info is None on success, otherwise
            the exception object describing why the pdf should be discarded.
        """

        error_info = None

        output_json_file = ""
        output_dir = ""

        if input_pdf_path is not None:
            input_pdf_path = os.path.abspath(input_pdf_path)

        if output_pdf_path is not None:
            output_dir = os.path.dirname(output_pdf_path)
            output_json_file = f"{output_dir}/pdf_dic.json"

        def __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode):
            """
            Save the pdf_dic of one pipeline stage to a json file.

            NOTE(review): para_debug_mode is captured as a default argument from
            the enclosing call — intentional closure-by-default idiom.
            """
            output_pdf_file_name = os.path.basename(output_pdf_path)
            # NOTE(review): output_dir is hard-coded (backslash-style path);
            # looks like a leftover debug location — confirm whether this
            # should be os.path.dirname(output_pdf_path) instead.
            output_dir = "\\tmp\\pdf_parse"
            output_pdf_file_name = output_pdf_file_name.replace(".pdf", f"_stage_{stage}.json")
            pdf_dic_json_fpath = os.path.join(output_dir, output_pdf_file_name)

            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

            if para_debug_mode == "full":
                with open(pdf_dic_json_fpath, "w", encoding="utf-8") as f:
                    json.dump(pdf_dic, f, indent=2, ensure_ascii=False)

            # Validate the output already exists
            if not os.path.exists(pdf_dic_json_fpath):
                print_red(f"Failed to save the pdf_dic to {pdf_dic_json_fpath}")
                return None
            else:
                print_green(f"Succeed to save the pdf_dic to {pdf_dic_json_fpath}")

            return pdf_dic_json_fpath

        """
        Preprocess the lines of block
        """
        # Combine spans into a natural line
        rawBlockProcessor = RawBlockProcessor()
        pdf_dic = rawBlockProcessor.batch_process_blocks(pdf_info_dict)

        # Check if the paragraphs are inside bboxes passed from "layout_bboxes" key
        layoutFilter = LayoutFilterProcessor()
        pdf_dic = layoutFilter.batch_process_blocks(pdf_dic)

        # Compute statistics for each block
        blockStatisticsCalculator = BlockStatisticsCalculator()
        pdf_dic = blockStatisticsCalculator.batch_process_blocks(pdf_dic)

        # Compute statistics for all blocks (namely this pdf document)
        docStatisticsCalculator = DocStatisticsCalculator()
        pdf_dic = docStatisticsCalculator.calc_stats_of_doc(pdf_dic)

        # Dump the first three stages of pdf_dic to a json file
        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="0", para_debug_mode=para_debug_mode)

        """
        Detect titles in the document
        """
        doc_statistics = pdf_dic["statistics"]
        titleProcessor = TitleProcessor(doc_statistics)
        pdf_dic = titleProcessor.batch_detect_titles(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="1", para_debug_mode=para_debug_mode)

        """
        Detect and divide the level of the titles
        """
        # A fresh TitleProcessor (no statistics) is used for level recognition.
        titleProcessor = TitleProcessor()

        pdf_dic = titleProcessor.batch_recog_title_level(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="2", para_debug_mode=para_debug_mode)

        """
        Detect and split paragraphs inside each block
        """
        blockInnerParasProcessor = BlockTerminationProcessor()

        pdf_dic = blockInnerParasProcessor.batch_process_blocks(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="3", para_debug_mode=para_debug_mode)

        """
        Detect and combine paragraphs from different blocks into one paragraph
        """
        blockContinuationProcessor = BlockContinuationProcessor()

        pdf_dic = blockContinuationProcessor.batch_tag_paras(pdf_dic)
        pdf_dic = blockContinuationProcessor.batch_merge_paras(pdf_dic)

        if para_debug_mode == "full":
            pdf_dic_json_fpath = __save_pdf_dic(pdf_dic, output_pdf_path, stage="4", para_debug_mode=para_debug_mode)

        """
        Discard pdf files by checking exceptions and return the error info to the caller
        """
        discardByException = DiscardByException()

        is_discard_by_single_line_block = discardByException.discard_by_single_line_block(
            pdf_dic, exception=DenseSingleLineBlockException()
        )
        is_discard_by_title_detection = discardByException.discard_by_title_detection(
            pdf_dic, exception=TitleDetectionException()
        )
        is_discard_by_title_level = discardByException.discard_by_title_level(pdf_dic, exception=TitleLevelException())
        is_discard_by_split_para = discardByException.discard_by_split_para(pdf_dic, exception=ParaSplitException())
        is_discard_by_merge_para = discardByException.discard_by_merge_para(pdf_dic, exception=ParaMergeException())

        # First failed check wins; later checks are not reported.
        if is_discard_by_single_line_block is not None:
            error_info = is_discard_by_single_line_block
        elif is_discard_by_title_detection is not None:
            error_info = is_discard_by_title_detection
        elif is_discard_by_title_level is not None:
            error_info = is_discard_by_title_level
        elif is_discard_by_split_para is not None:
            error_info = is_discard_by_split_para
        elif is_discard_by_merge_para is not None:
            error_info = is_discard_by_merge_para

        if error_info is not None:
            return pdf_dic, error_info

        """
        Dump the final pdf_dic to a json file
        """
        if para_debug_mode is not None:
            # NOTE(review): this dumps pdf_info_dict (the pipeline INPUT), not
            # pdf_dic (the processed result) — confirm whether that is intended.
            with open(output_json_file, "w", encoding="utf-8") as f:
                json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)

        """
        Draw the annotations
        """
        if para_debug_mode is not None:
            drawAnnos = DrawAnnos()
            drawAnnos.draw_annos(input_pdf_path, pdf_dic, output_pdf_path)

        """
        Remove the intermediate files which are generated in the process of paragraph processing if debug_mode is simple
        """
        if para_debug_mode is not None:
            # NOTE(review): output_dir may be "" when output_pdf_path is None,
            # which would make os.listdir raise — verify callers always pass it.
            for fpath in os.listdir(output_dir):
                if fpath.endswith(".json") and "stage" in fpath:
                    os.remove(os.path.join(output_dir, fpath))

        return pdf_dic, error_info
"""
Run this script to test the function with Command:

python detect_para.py [pdf_path] [output_pdf_path]

Params:
- pdf_path: the path of the pdf file
- output_pdf_path: the path of the output pdf file
"""

if __name__ == "__main__":
    DEFAULT_PDF_PATH = (
        "app/pdf_toolbox/tests/assets/paper/paper.pdf" if os.name != "nt" else "app\\pdf_toolbox\\tests\\assets\\paper\\paper.pdf"
    )
    input_pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    # Bug fix: the original derived output names with input_pdf_path.split(".")[0],
    # which truncates at the FIRST dot anywhere in the path (e.g. "./dir/a.pdf"
    # becomes "" + suffix). os.path.splitext strips only the final extension.
    input_pdf_stem = os.path.splitext(input_pdf_path)[0]
    output_pdf_path = sys.argv[2] if len(sys.argv) > 2 else input_pdf_stem + "_recogPara.pdf"
    output_json_path = sys.argv[3] if len(sys.argv) > 3 else input_pdf_stem + "_recogPara.json"

    import stat

    # Remove existing output file if it exists (clear read-only bit first).
    if os.path.exists(output_pdf_path):
        os.chmod(output_pdf_path, stat.S_IWRITE)
        os.remove(output_pdf_path)

    input_pdf_doc = open_pdf(input_pdf_path)

    # postprocess the paragraphs
    paraProcessPipeline = ParaProcessPipeline()

    # parse paragraph and save to json file
    pdf_dic = {}

    blockInnerParasProcessor = BlockTerminationProcessor()

    """
    Construct the pdf dictionary.
    """

    for page_id, page in enumerate(input_pdf_doc):  # type: ignore
        raw_blocks = page.get_text("dict")["blocks"]

        # Keep only text blocks (type 0) as "preproc_blocks".
        preproc_blocks = [block for block in raw_blocks if block["type"] == 0]

        # Construct the pdf dictionary as the schema above.
        page_dict = {
            "para_blocks": None,
            "preproc_blocks": preproc_blocks,
            "images": None,
            "tables": None,
            "interline_equations": None,
            "inline_equations": None,
            "layout_bboxes": None,
            "pymu_raw_blocks": None,
            "global_statistic": None,
            "droped_text_block": None,
            "droped_image_block": None,
            "droped_table_block": None,
            "image_backup": None,
            "table_backup": None,
        }

        pdf_dic[f"page_{page_id}"] = page_dict

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(pdf_dic, f, ensure_ascii=False, indent=4)

    # NOTE(review): para_process_pipeline's signature is
    # (pdf_info_dict, para_debug_mode=None, input_pdf_path=None, output_pdf_path=None);
    # this positional call passes a json PATH as pdf_info_dict and the fitz
    # document as para_debug_mode — looks inconsistent, confirm intent.
    pdf_dic = paraProcessPipeline.para_process_pipeline(output_json_path, input_pdf_doc, output_pdf_path)
def __is_pseudo_single_column(page_info) -> tuple:
    """
    Decide whether a page laid out as a single column is actually a
    pseudo-single-column (lines of neighbouring columns interleaved).

    Args:
        page_info (dict): page info dict containing '_layout_tree' and
            'preproc_blocks'.

    Returns:
        Tuple[bool, Optional[str]]: (True, extra_info) when the page is
        pseudo-single-column, otherwise (False, None). (The original
        docstring claimed a bare bool — the function returns a tuple.)
    """
    layout_tree = page_info['_layout_tree']
    layout_column_width = get_columns_cnt_of_layout(layout_tree)
    if layout_column_width == 1:
        text_blocks = page_info['preproc_blocks']
        for text_block in text_blocks:
            lines = text_block['lines']
            num_lines = len(lines)
            num_satisfying_lines = 0

            for i in range(num_lines - 1):
                current_bbox = lines[i]['bbox']
                next_bbox = lines[i + 1]['bbox']

                # Consecutive lines that do not overlap horizontally suggest
                # the block actually spans side-by-side columns.
                if next_bbox[0] > current_bbox[2] or next_bbox[2] < current_bbox[0]:
                    num_satisfying_lines += 1

            # Drop the page when a large block has half or more such lines.
            if num_lines > 20:
                ratio = num_satisfying_lines / num_lines  # fixed typo: was "radio"
                if ratio >= 0.5:
                    extra_info = f"{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}"
                    block_text = []
                    for line in lines:
                        if line['spans']:
                            for span in line['spans']:
                                block_text.append(span['text'])
                    logger.warning(f"pseudo_single_column block_text: {block_text}")
                    return True, extra_info

    return False, None


def pdf_post_filter(page_info) -> tuple:
    """
    Post-filter one page.

    Returns:
        (True, None) when the page passes;
        (False, drop_info_dict) when the page should be dropped.
    """
    bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
    if bool_is_pseudo_single_column:
        return False, {"_need_drop": True, "_drop_reason": DropReason.PSEUDO_SINGLE_COLUMN, "extra_info": extra_info}

    return True, None


def is_below(bbox1, bbox2):
    """Return True if bbox1 lies entirely below bbox2 (its top edge is past bbox2's bottom edge)."""
    return bbox1[1] > bbox2[3]


def merge_bboxes(bboxes):
    """Return the smallest bbox [x0, y0, x1, y1] enclosing every bbox in *bboxes*."""
    x0 = min(bbox[0] for bbox in bboxes)
    y0 = min(bbox[1] for bbox in bboxes)
    x1 = max(bbox[2] for bbox in bboxes)
    y1 = max(bbox[3] for bbox in bboxes)
    return [x0, y0, x1, y1]


def merge_footnote_blocks(page_info, main_text_font):
    """
    For each layout region, validate the candidate footnote bboxes
    ('footnote_bboxes_tmp') and store merged footnote areas in
    page_info['merged_bboxes'].

    A candidate footnote is rejected when a main-text-looking block sits below
    it (a real footnote is the lowest text in its layout region).

    Parameters
    ----------
    page_info : dict
        Page info with 'layout_bboxes', 'footnote_bboxes_tmp', 'preproc_blocks'.
    main_text_font : str
        Name of the document's dominant body font.

    Returns
    -------
    dict
        The same page_info with 'merged_bboxes' populated.
    """
    page_info['merged_bboxes'] = []
    for layout in page_info['layout_bboxes']:
        # Candidate footnote bboxes inside this layout region.
        footnote_bboxes = [block for block in page_info['footnote_bboxes_tmp'] if _is_in(block, layout['layout_bbox'])]
        if not footnote_bboxes:
            continue

        preproc_blocks = [block for block in page_info['preproc_blocks'] if _is_in(block['bbox'], layout['layout_bbox'])]
        font_names = collections.Counter()
        if len(preproc_blocks) > 0:
            # line_sizes: average span size of every line in the region;
            # block_sizes: (block, avg line size, dominant font) per block.
            line_sizes = []
            block_sizes = []
            for block in preproc_blocks:
                block_line_sizes = []
                block_fonts = collections.Counter()
                for line in block['lines']:
                    span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
                    if span_sizes:
                        line_size = sum(span_sizes) / len(span_sizes)
                        line_sizes.append(line_size)
                        block_line_sizes.append(line_size)
                    span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
                                 'font' in span and len(span['text']) > 0]
                    if span_font:
                        # Fonts are weighted by character count, not span count.
                        for font, count in span_font:
                            font_names[font] += count
                            block_fonts[font] += count
                if block_line_sizes:
                    block_size = sum(block_line_sizes) / len(block_line_sizes)
                    block_font = block_fonts.most_common(1)[0][0]
                    block_sizes.append((block, block_size, block_font))

            # Main text size = the most common line size in this region.
            main_text_size = collections.Counter(line_sizes).most_common(1)[0][0]
        else:
            continue

        need_merge_bboxes = []
        # Any footnote candidate with a main-text block below it is a false positive.
        for footnote_bbox in footnote_bboxes:
            # "Main text" below = at least 2 of: size >= main_text_size,
            # >= 5 lines, dominant font equals the document's main font.
            main_text_bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if
                                      is_below(block['bbox'], footnote_bbox) and
                                      sum([size >= main_text_size,
                                           len(block['lines']) >= 5,
                                           block_font == main_text_font])
                                      >= 2]
            if len(main_text_bboxes_below) > 0:
                continue
            need_merge_bboxes.append(footnote_bbox)
        if len(need_merge_bboxes) == 0:
            continue

        # Merge the topmost surviving footnote with every block below it.
        top_footnote_bbox = min(need_merge_bboxes, key=lambda bbox: bbox[1])
        bboxes_below = [block['bbox'] for block, size, block_font in block_sizes if is_below(block['bbox'], top_footnote_bbox)]
        merged_bbox = merge_bboxes([top_footnote_bbox] + bboxes_below)
        page_info['merged_bboxes'].append(merged_bbox)
    return page_info
page_info['merged_bboxes'] + del page_info['footnote_bboxes_tmp'] + return page_info + + +def remove_footnote_text(raw_text_block, footnote_bboxes): + """ + :param raw_text_block: str类型,是当前页的文本内容 + :param footnoteBboxes: list类型,是当前页的脚注bbox + """ + footnote_text_blocks = [] + for block in raw_text_block: + text_bbox = block['bbox'] + # TODO 更严谨点在line级别做 + if any([_is_in_or_part_overlap(text_bbox, footnote_bbox) for footnote_bbox in footnote_bboxes]): + # if any([text_bbox[3]>=footnote_bbox[1] for footnote_bbox in footnote_bboxes]): + block['tag'] = 'footnote' + footnote_text_blocks.append(block) + # raw_text_block.remove(block) + + # 移除,不能再内部移除,否则会出错 + for block in footnote_text_blocks: + raw_text_block.remove(block) + + return raw_text_block, footnote_text_blocks + + +def remove_footnote_image(image_blocks, footnote_bboxes): + """ + :param image_bboxes: list类型,是当前页的图片bbox(结构体) + :param footnoteBboxes: list类型,是当前页的脚注bbox + """ + footnote_imgs_blocks = [] + for image_block in image_blocks: + if any([_is_in(image_block['bbox'], footnote_bbox) for footnote_bbox in footnote_bboxes]): + footnote_imgs_blocks.append(image_block) + + for footnote_imgs_block in footnote_imgs_blocks: + image_blocks.remove(footnote_imgs_block) + + return image_blocks, footnote_imgs_blocks \ No newline at end of file diff --git a/magic_pdf/pre_proc/__init__.py b/magic_pdf/pre_proc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/pre_proc/citationmarker_remove.py b/magic_pdf/pre_proc/citationmarker_remove.py new file mode 100644 index 0000000000000000000000000000000000000000..2a2c485f78764e32354f95f75965d2546d98fe3f --- /dev/null +++ b/magic_pdf/pre_proc/citationmarker_remove.py @@ -0,0 +1,157 @@ +""" +去掉正文的引文引用marker +https://aicarrier.feishu.cn/wiki/YLOPwo1PGiwFRdkwmyhcZmr0n3d +""" +import re +# from magic_pdf.libs.nlp_utils import NLPModels + + +# __NLP_MODEL = NLPModels() + +def check_1(spans, 
cur_span_i): + """寻找前一个char,如果是句号,逗号,那么就是角标""" + if cur_span_i==0: + return False # 不是角标 + pre_span = spans[cur_span_i-1] + pre_char = pre_span['chars'][-1]['c'] + if pre_char in ['。', ',', '.', ',']: + return True + + return False + + +# def check_2(spans, cur_span_i): +# """检查前面一个span的最后一个单词,如果长度大于5,全都是字母,并且不含大写,就是角标""" +# pattern = r'\b[A-Z]\.\s[A-Z][a-z]*\b' # 形如A. Bcde, L. Bcde, 人名的缩写 +# +# if cur_span_i==0 and len(spans)>1: +# next_span = spans[cur_span_i+1] +# next_txt = "".join([c['c'] for c in next_span['chars']]) +# result = __NLP_MODEL.detect_entity_catgr_using_nlp(next_txt) +# if result in ["PERSON", "GPE", "ORG"]: +# return True +# +# if re.findall(pattern, next_txt): +# return True +# +# return False # 不是角标 +# elif cur_span_i==0 and len(spans)==1: # 角标占用了整行?谨慎删除 +# return False +# +# # 如果这个span是最后一个span, +# if cur_span_i==len(spans)-1: +# pre_span = spans[cur_span_i-1] +# pre_txt = "".join([c['c'] for c in pre_span['chars']]) +# pre_word = pre_txt.split(' ')[-1] +# result = __NLP_MODEL.detect_entity_catgr_using_nlp(pre_txt) +# if result in ["PERSON", "GPE", "ORG"]: +# return True +# +# if re.findall(pattern, pre_txt): +# return True +# +# return len(pre_word) > 5 and pre_word.isalpha() and pre_word.islower() +# else: # 既不是第一个span,也不是最后一个span,那么此时检查一下这个角标距离前后哪个单词更近就属于谁的角标 +# pre_span = spans[cur_span_i-1] +# next_span = spans[cur_span_i+1] +# cur_span = spans[cur_span_i] +# # 找到前一个和后一个span里的距离最近的单词 +# pre_distance = 10000 # 一个很大的数 +# next_distance = 10000 # 一个很大的数 +# for c in pre_span['chars'][::-1]: +# if c['c'].isalpha(): +# pre_distance = cur_span['bbox'][0] - c['bbox'][2] +# break +# for c in next_span['chars']: +# if c['c'].isalpha(): +# next_distance = c['bbox'][0] - cur_span['bbox'][2] +# break +# +# if pre_distance 5 and pre_word.isalpha() and pre_word.islower() + + +def check_3(spans, cur_span_i): + """上标里有[], 有*, 有-, 有逗号""" + # 如[2-3],[22] + # 如 2,3,4 + cur_span_txt = ''.join(c['c'] for c in spans[cur_span_i]['chars']).strip() + bad_char = 
['[', ']', '*', ','] + + if any([c in cur_span_txt for c in bad_char]) and any(character.isdigit() for character in cur_span_txt): + return True + + # 如2-3, a-b + patterns = [r'\d+-\d+', r'[a-zA-Z]-[a-zA-Z]', r'[a-zA-Z],[a-zA-Z]'] + for pattern in patterns: + match = re.match(pattern, cur_span_txt) + if match is not None: + return True + + return False + + +def remove_citation_marker(with_char_text_blcoks): + for blk in with_char_text_blcoks: + for line in blk['lines']: + # 如果span里的个数少于2个,那只能忽略,角标不可能自己独占一行 + if len(line['spans'])<=1: + continue + + # 找到高度最高的span作为位置比较的基准 + max_hi_span = line['spans'][0]['bbox'] + min_font_sz = 10000 # line里最小的字体 + max_font_sz = 0 # line里最大的字体 + + for s in line['spans']: + if max_hi_span[3]-max_hi_span[1]s['size']: + min_font_sz = s['size'] + if max_font_sz0.2 or (base_span_mid_y-span_mid_y>0 and abs(span_font_sz-min_font_sz)/min_font_sz<0.1): + """ + 1. 它的前一个char如果是句号或者逗号的话,那么肯定是角标而不是公式 + 2. 如果这个角标的前面是一个单词(长度大于5)而不是任何大写或小写的短字母的话 应该也是角标 + 3. 上标里有数字和逗号或者数字+星号的组合,方括号,一般肯定就是角标了 + 4. 
这个角标属于前文还是后文要根据距离来判断,如果距离前面的文本太近,那么就是前面的角标,否则就是后面的角标 + """ + if (check_1(line['spans'], i) or + # check_2(line['spans'], i) or + check_3(line['spans'], i) + ): + """删除掉这个角标:删除这个span, 同时还要更新line的text""" + span_to_del.append(span) + if len(span_to_del)>0: + for span in span_to_del: + line['spans'].remove(span) + line['text'] = ''.join([c['c'] for s in line['spans'] for c in s['chars']]) + + return with_char_text_blcoks diff --git a/magic_pdf/pre_proc/construct_page_dict.py b/magic_pdf/pre_proc/construct_page_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..c2f83c1005ba507dee65f65aaa52ee34b805b477 --- /dev/null +++ b/magic_pdf/pre_proc/construct_page_dict.py @@ -0,0 +1,72 @@ +def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info, + interline_eq_info, raw_pymu_blocks, + removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup, + layout_tree, + page_w, page_h, footnote_bboxes_tmp): + """ + + """ + return_dict = {} + + return_dict['para_blocks'] = {} + return_dict['preproc_blocks'] = text_blocks_preproc + return_dict['images'] = image_info + return_dict['tables'] = table_info + return_dict['interline_equations'] = interline_eq_info + return_dict['inline_equations'] = inline_eq_info + return_dict['layout_bboxes'] = layout_bboxes + return_dict['pymu_raw_blocks'] = raw_pymu_blocks + return_dict['global_statistic'] = {} + + return_dict['droped_text_block'] = removed_text_blocks + return_dict['droped_image_block'] = removed_image_blocks + return_dict['droped_table_block'] = [] + return_dict['image_backup'] = images_backup + return_dict['table_backup'] = [] + return_dict['page_idx'] = page_id + return_dict['page_size'] = [page_w, page_h] + return_dict['_layout_tree'] = layout_tree # 辅助分析layout作用 + return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp + + return return_dict + + +def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, 
page_h, layout_tree, + images, tables, interline_equations, inline_equations, + dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block, + need_remove_spans_bboxes_dict): + return_dict = { + 'preproc_blocks': blocks, + 'layout_bboxes': layout_bboxes, + 'page_idx': page_id, + 'page_size': [page_w, page_h], + '_layout_tree': layout_tree, + 'images': images, + 'tables': tables, + 'interline_equations': interline_equations, + 'inline_equations': inline_equations, + 'droped_text_block': dropped_text_block, + 'droped_image_block': dropped_image_block, + 'droped_table_block': dropped_table_block, + 'dropped_equation_block': dropped_equation_block, + 'droped_bboxes': need_remove_spans_bboxes_dict, + } + return return_dict + + +def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree, + images, tables, interline_equations, discarded_blocks, need_drop, drop_reason): + return_dict = { + 'preproc_blocks': blocks, + 'layout_bboxes': layout_bboxes, + 'page_idx': page_id, + 'page_size': [page_w, page_h], + '_layout_tree': layout_tree, + 'images': images, + 'tables': tables, + 'interline_equations': interline_equations, + 'discarded_blocks': discarded_blocks, + 'need_drop': need_drop, + 'drop_reason': drop_reason, + } + return return_dict diff --git a/magic_pdf/pre_proc/cut_image.py b/magic_pdf/pre_proc/cut_image.py new file mode 100644 index 0000000000000000000000000000000000000000..18ee65129d0d07b14bbd5aadec16f91b646cf06d --- /dev/null +++ b/magic_pdf/pre_proc/cut_image.py @@ -0,0 +1,71 @@ +from loguru import logger + +from magic_pdf.libs.commons import join_path +from magic_pdf.libs.ocr_content_type import ContentType +from magic_pdf.libs.pdf_image_tools import cut_image + + +def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter): + def return_path(type): + return join_path(pdf_bytes_md5, type) + + for span in spans: + span_type = span['type'] + if span_type == ContentType.Image: + if not 
check_img_bbox(span['bbox']): + continue + span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('images'), + imageWriter=imageWriter) + elif span_type == ContentType.Table: + if not check_img_bbox(span['bbox']): + continue + span['image_path'] = cut_image(span['bbox'], page_id, page, return_path=return_path('tables'), + imageWriter=imageWriter) + + return spans + + +def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str, + image_bboxes: list, images_overlap_backup: list, table_bboxes: list, + equation_inline_bboxes: list, + equation_interline_bboxes: list, imageWriter) -> dict: + """ + 返回一个dict, key为bbox, 值是图片地址 + """ + image_info = [] + image_backup_info = [] + table_info = [] + inline_eq_info = [] + interline_eq_info = [] + + # 图片的保存路径组成是这样的: {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg + + def return_path(type): + return join_path(pdf_bytes_md5, type) + + for bbox in image_bboxes: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) + image_info.append({"bbox": bbox, "image_path": image_path}) + + for bbox in images_overlap_backup: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("images"), imageWriter) + image_backup_info.append({"bbox": bbox, "image_path": image_path}) + + for bbox in table_bboxes: + if not check_img_bbox(bbox): + continue + image_path = cut_image(bbox, page_num, page, return_path("tables"), imageWriter) + table_info.append({"bbox": bbox, "image_path": image_path}) + + return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info + + +def check_img_bbox(bbox) -> bool: + if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]): + logger.warning(f"image_bboxes: 错误的box, {bbox}") + return False + return True diff --git a/magic_pdf/pre_proc/detect_equation.py b/magic_pdf/pre_proc/detect_equation.py new 
file mode 100644 index 0000000000000000000000000000000000000000..f395030c4233db92023512b201f896e5d814f03c --- /dev/null +++ b/magic_pdf/pre_proc/detect_equation.py @@ -0,0 +1,134 @@ +from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则 +from magic_pdf.libs.commons import fitz # pyMuPDF库 + + +def __solve_contain_bboxs(all_bbox_list: list): + + """将两个公式的bbox做判断是否有包含关系,若有的话则删掉较小的bbox""" + + dump_list = [] + for i in range(len(all_bbox_list)): + for j in range(i + 1, len(all_bbox_list)): + # 获取当前两个值 + bbox1 = all_bbox_list[i][:4] + bbox2 = all_bbox_list[j][:4] + + # 删掉较小的框 + if _is_in(bbox1, bbox2): + dump_list.append(all_bbox_list[i]) + elif _is_in(bbox2, bbox1): + dump_list.append(all_bbox_list[j]) + else: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) + if ratio > 0.7: + s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) + s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + if s2 > s1: + dump_list.append(all_bbox_list[i]) + else: + dump_list.append(all_bbox_list[i]) + + # 遍历需要删除的列表中的每个元素 + for item in dump_list: + + while item in all_bbox_list: + all_bbox_list.remove(item) + return all_bbox_list + + +def parse_equations(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + DPI = 72 # use this resolution + pix = page.get_pixmap(dpi=DPI) + pageL = 0 + pageR = int(pix.w) + pageU = 0 + pageD = int(pix.h) + + + #--------- 通过json_from_DocXchain来获取 table ---------# + equationEmbedding_from_DocXChain_bboxs = [] + equationIsolated_from_DocXChain_bboxs = [] + + xf_json = json_from_DocXchain_obj + width_from_json = xf_json['page_info']['width'] + height_from_json = 
xf_json['page_info']['height'] + LR_scaleRatio = width_from_json / (pageR - pageL) + UD_scaleRatio = height_from_json / (pageD - pageU) + + for xf in xf_json['layout_dets']: + # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'} + L = xf['poly'][0] / LR_scaleRatio + U = xf['poly'][1] / UD_scaleRatio + R = xf['poly'][2] / LR_scaleRatio + D = xf['poly'][5] / UD_scaleRatio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + # equation + img_suffix = f"{page_ID}_{int(L)}_{int(U)}_{int(R)}_{int(D)}" + if xf['category_id'] == 13 and xf['score'] >= 0.3: + latex_text = xf.get("latex", "EmptyInlineEquationResult") + debugable_latex_text = f"{latex_text}|{img_suffix}" + equationEmbedding_from_DocXChain_bboxs.append((L, U, R, D, latex_text)) + if xf['category_id'] == 14 and xf['score'] >= 0.3: + latex_text = xf.get("latex", "EmptyInterlineEquationResult") + debugable_latex_text = f"{latex_text}|{img_suffix}" + equationIsolated_from_DocXChain_bboxs.append((L, U, R, D, latex_text)) + + #---------------------------------------- 排序,编号,保存 -----------------------------------------# + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + + equationEmbedding_from_DocXChain_names = [] + equationEmbedding_ID = 0 + + equationIsolated_from_DocXChain_names = [] + equationIsolated_ID = 0 + + for L, U, R, D, _ in equationEmbedding_from_DocXChain_bboxs: + if not(L < R and U < D): + continue + try: + # cur_equation = page.get_pixmap(clip=(L,U,R,D)) + new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationEmbedding_ID) # 公式name + # cur_equation.save(res_dir_path + '/' + new_equation_name) # 把公式存出在新建的文件夹,并命名 + 
equationEmbedding_from_DocXChain_names.append(new_equation_name) # 把公式的名字存在list中,方便在md中插入引用 + equationEmbedding_ID += 1 + except: + pass + + for L, U, R, D, _ in equationIsolated_from_DocXChain_bboxs: + if not(L < R and U < D): + continue + try: + # cur_equation = page.get_pixmap(clip=(L,U,R,D)) + new_equation_name = "equationEmbedding_{}_{}.png".format(page_ID, equationIsolated_ID) # 公式name + # cur_equation.save(res_dir_path + '/' + new_equation_name) # 把公式存出在新建的文件夹,并命名 + equationIsolated_from_DocXChain_names.append(new_equation_name) # 把公式的名字存在list中,方便在md中插入引用 + equationIsolated_ID += 1 + except: + pass + + equationEmbedding_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + equationIsolated_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + + + """根据pdf可视区域,调整bbox的坐标""" + cropbox = page.cropbox + if cropbox[0]!=page.rect[0] or cropbox[1]!=page.rect[1]: + for eq_box in equationEmbedding_from_DocXChain_bboxs: + eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]] + for eq_box in equationIsolated_from_DocXChain_bboxs: + eq_box = [eq_box[0]+cropbox[0], eq_box[1]+cropbox[1], eq_box[2]+cropbox[0], eq_box[3]+cropbox[1], eq_box[4]] + + deduped_embedding_eq_bboxes = __solve_contain_bboxs(equationEmbedding_from_DocXChain_bboxs) + return deduped_embedding_eq_bboxes, equationIsolated_from_DocXChain_bboxs diff --git a/magic_pdf/pre_proc/detect_footer_by_model.py b/magic_pdf/pre_proc/detect_footer_by_model.py new file mode 100644 index 0000000000000000000000000000000000000000..0c1fbf38b6c5a61b477b0aab594f966ef3d2676e --- /dev/null +++ b/magic_pdf/pre_proc/detect_footer_by_model.py @@ -0,0 +1,64 @@ +from magic_pdf.libs.commons import fitz # pyMuPDF库 +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param 
res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + + #--------- 通过json_from_DocXchain来获取 footer ---------# + footer_bbox_from_DocXChain = [] + + xf_json = json_from_DocXchain_obj + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page) + + # {0: 'title', # 标题 + # 1: 'figure', # 图片 + # 2: 'plain text', # 文本 + # 3: 'header', # 页眉 + # 4: 'page number', # 页码 + # 5: 'footnote', # 脚注 + # 6: 'footer', # 页脚 + # 7: 'table', # 表格 + # 8: 'table caption', # 表格描述 + # 9: 'figure caption', # 图片描述 + # 10: 'equation', # 公式 + # 11: 'full column', # 单栏 + # 12: 'sub column', # 多栏 + # 13: 'embedding', # 嵌入公式 + # 14: 'isolated'} # 单行公式 + for xf in xf_json['layout_dets']: + L = xf['poly'][0] / horizontal_scale_ratio + U = xf['poly'][1] / vertical_scale_ratio + R = xf['poly'][2] / horizontal_scale_ratio + D = xf['poly'][5] / vertical_scale_ratio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + if xf['category_id'] == 6 and xf['score'] >= 0.3: + footer_bbox_from_DocXChain.append((L, U, R, D)) + + + footer_final_names = [] + footer_final_bboxs = [] + footer_ID = 0 + for L, U, R, D in footer_bbox_from_DocXChain: + # cur_footer = page.get_pixmap(clip=(L,U,R,D)) + new_footer_name = "footer_{}_{}.png".format(page_ID, footer_ID) # 脚注name + # cur_footer.save(res_dir_path + '/' + new_footer_name) # 把页脚存储在新建的文件夹,并命名 + footer_final_names.append(new_footer_name) # 把脚注的名字存在list中 + footer_final_bboxs.append((L, U, R, D)) + footer_ID += 1 + + + footer_final_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + curPage_all_footer_bboxs = footer_final_bboxs + return curPage_all_footer_bboxs + diff --git a/magic_pdf/pre_proc/detect_footer_header_by_statistics.py 
b/magic_pdf/pre_proc/detect_footer_header_by_statistics.py new file mode 100644 index 0000000000000000000000000000000000000000..340965d0ea92c3aa75e4a428da4cdff728db1124 --- /dev/null +++ b/magic_pdf/pre_proc/detect_footer_header_by_statistics.py @@ -0,0 +1,284 @@ +from collections import defaultdict + +from magic_pdf.libs.boxbase import calculate_iou + + +def compare_bbox_with_list(bbox, bbox_list, tolerance=1): + return any(all(abs(a - b) < tolerance for a, b in zip(bbox, common_bbox)) for common_bbox in bbox_list) + +def is_single_line_block(block): + # Determine based on the width and height of the block + block_width = block["X1"] - block["X0"] + block_height = block["bbox"][3] - block["bbox"][1] + + # If the height of the block is close to the average character height and the width is large, it is considered a single line + return block_height <= block["avg_char_height"] * 3 and block_width > block["avg_char_width"] * 3 + +def get_most_common_bboxes(bboxes, page_height, position="top", threshold=0.25, num_bboxes=3, min_frequency=2): + """ + This function gets the most common bboxes from the bboxes + + Parameters + ---------- + bboxes : list + bboxes + page_height : float + height of the page + position : str, optional + "top" or "bottom", by default "top" + threshold : float, optional + threshold, by default 0.25 + num_bboxes : int, optional + number of bboxes to return, by default 3 + min_frequency : int, optional + minimum frequency of the bbox, by default 2 + + Returns + ------- + common_bboxes : list + common bboxes + """ + # Filter bbox by position + if position == "top": + filtered_bboxes = [bbox for bbox in bboxes if bbox[1] < page_height * threshold] + else: + filtered_bboxes = [bbox for bbox in bboxes if bbox[3] > page_height * (1 - threshold)] + + # Find the most common bbox + bbox_count = defaultdict(int) + for bbox in filtered_bboxes: + bbox_count[tuple(bbox)] += 1 + + # Get the most frequently occurring bbox, but only consider it when the 
frequency exceeds min_frequency + common_bboxes = [ + bbox for bbox, count in sorted(bbox_count.items(), key=lambda item: item[1], reverse=True) if count >= min_frequency + ][:num_bboxes] + return common_bboxes + +def detect_footer_header2(result_dict, similarity_threshold=0.5): + """ + This function detects the header and footer of the document. + + Parameters + ---------- + result_dict : dict + result dictionary + + Returns + ------- + result_dict : dict + result dictionary + """ + # Traverse all blocks in the document + single_line_blocks = 0 + total_blocks = 0 + single_line_blocks = 0 + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + total_blocks += 1 + if is_single_line_block(block): + single_line_blocks += 1 + + # If there are no blocks, skip the header and footer detection + if total_blocks == 0: + print("No blocks found. Skipping header/footer detection.") + return result_dict + + # If most of the blocks are single-line, skip the header and footer detection + if single_line_blocks / total_blocks > 0.5: # 50% of the blocks are single-line + # print("Skipping header/footer detection for text-dense document.") + return result_dict + + # Collect the bounding boxes of all blocks + all_bboxes = [] + all_texts = [] + + for page_id, blocks in result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + all_bboxes.append(block["bbox"]) + + # Get the height of the page + page_height = max(bbox[3] for bbox in all_bboxes) + + # Get the most common bbox lists for headers and footers + common_header_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="top") if all_bboxes else [] + common_footer_bboxes = get_most_common_bboxes(all_bboxes, page_height, position="bottom") if all_bboxes else [] + + # Detect and mark headers and footers + for page_id, blocks in 
result_dict.items(): + if page_id.startswith("page_"): + for block_key, block in blocks.items(): + if block_key.startswith("block_"): + bbox = block["bbox"] + text = block["text"] + + is_header = compare_bbox_with_list(bbox, common_header_bboxes) + is_footer = compare_bbox_with_list(bbox, common_footer_bboxes) + block["is_header"] = int(is_header) + block["is_footer"] = int(is_footer) + + return result_dict + + +def __get_page_size(page_sizes:list): + """ + 页面大小可能不一样 + """ + w = sum([w for w,h in page_sizes])/len(page_sizes) + h = sum([h for w,h in page_sizes])/len(page_sizes) + return w, h + +def __calculate_iou(bbox1, bbox2): + iou = calculate_iou(bbox1, bbox2) + return iou + +def __is_same_pos(box1, box2, iou_threshold): + iou = __calculate_iou(box1, box2) + return iou >= iou_threshold + + +def get_most_common_bbox(bboxes:list, page_size:list, page_cnt:int, page_range_threshold=0.2, iou_threshold=0.9): + """ + common bbox必须大于page_cnt的1/3 + """ + min_occurance_cnt = max(3, page_cnt//4) + header_det_bbox = [] + footer_det_bbox = [] + + hdr_same_pos_group = [] + btn_same_pos_group = [] + + page_w, page_h = __get_page_size(page_size) + top_y, bottom_y = page_w*page_range_threshold, page_h*(1-page_range_threshold) + + top_bbox = [b for b in bboxes if b[3]bottom_y] + # 然后开始排序,寻找最经常出现的bbox, 寻找的时候如果IOU>iou_threshold就算是一个 + for i in range(0, len(top_bbox)): + hdr_same_pos_group.append([top_bbox[i]]) + for j in range(i+1, len(top_bbox)): + if __is_same_pos(top_bbox[i], top_bbox[j], iou_threshold): + #header_det_bbox = [min(top_bbox[i][0], top_bbox[j][0]), min(top_bbox[i][1], top_bbox[j][1]), max(top_bbox[i][2], top_bbox[j][2]), max(top_bbox[i][3],top_bbox[j][3])] + hdr_same_pos_group[i].append(top_bbox[j]) + + for i in range(0, len(bottom_bbox)): + btn_same_pos_group.append([bottom_bbox[i]]) + for j in range(i+1, len(bottom_bbox)): + if __is_same_pos(bottom_bbox[i], bottom_bbox[j], iou_threshold): + #footer_det_bbox = [min(bottom_bbox[i][0], bottom_bbox[j][0]), 
min(bottom_bbox[i][1], bottom_bbox[j][1]), max(bottom_bbox[i][2], bottom_bbox[j][2]), max(bottom_bbox[i][3],bottom_bbox[j][3])] + btn_same_pos_group[i].append(bottom_bbox[j]) + + # 然后看下每一组的bbox,是否符合大于page_cnt一定比例 + hdr_same_pos_group = [g for g in hdr_same_pos_group if len(g)>=min_occurance_cnt] + btn_same_pos_group = [g for g in btn_same_pos_group if len(g)>=min_occurance_cnt] + + # 平铺2个list[list] + hdr_same_pos_group = [bbox for g in hdr_same_pos_group for bbox in g] + btn_same_pos_group = [bbox for g in btn_same_pos_group for bbox in g] + # 寻找hdr_same_pos_group中的box[3]最大值,btn_same_pos_group中的box[1]最小值 + hdr_same_pos_group.sort(key=lambda b:b[3]) + btn_same_pos_group.sort(key=lambda b:b[1]) + + hdr_y = hdr_same_pos_group[-1][3] if hdr_same_pos_group else 0 + btn_y = btn_same_pos_group[0][1] if btn_same_pos_group else page_h + + header_det_bbox = [0, 0, page_w, hdr_y] + footer_det_bbox = [0, btn_y, page_w, page_h] + # logger.warning(f"header: {header_det_bbox}, footer: {footer_det_bbox}") + return header_det_bbox, footer_det_bbox, page_w, page_h + + +def drop_footer_header(pdf_info_dict:dict): + """ + 启用规则探测,在全局的视角上通过统计的方法。 + """ + header = [] + footer = [] + + all_text_bboxes = [blk['bbox'] for _, val in pdf_info_dict.items() for blk in val['preproc_blocks']] + image_bboxes = [img['bbox'] for _, val in pdf_info_dict.items() for img in val['images']] + [img['bbox'] for _, val in pdf_info_dict.items() for img in val['image_backup']] + page_size = [val['page_size'] for _, val in pdf_info_dict.items()] + page_cnt = len(pdf_info_dict.keys()) # 一共多少页 + header, footer, page_w, page_h = get_most_common_bbox(all_text_bboxes+image_bboxes, page_size, page_cnt) + + """" + 把范围扩展到页面水平的整个方向上 + """ + if header: + header = [0, 0, page_w, header[3]+1] + + if footer: + footer = [0, footer[1]-1, page_w, page_h] + + # 找到footer, header范围之后,针对每一页pdf,从text、图片中删除这些范围内的内容 + # 移除text block + + for _, page_info in pdf_info_dict.items(): + header_text_blk = [] + footer_text_blk = [] + for 
blk in page_info['preproc_blocks']: + blk_bbox = blk['bbox'] + if header and blk_bbox[3]<=header[3]: + blk['tag'] = "header" + header_text_blk.append(blk) + elif footer and blk_bbox[1]>=footer[1]: + blk['tag'] = "footer" + footer_text_blk.append(blk) + + # 放入text_block_droped中 + page_info['droped_text_block'].extend(header_text_blk) + page_info['droped_text_block'].extend(footer_text_blk) + + for blk in header_text_blk: + page_info['preproc_blocks'].remove(blk) + for blk in footer_text_blk: + page_info['preproc_blocks'].remove(blk) + + """接下来把footer、header上的图片也删除掉。图片包括正常的和backup的""" + header_image = [] + footer_image = [] + + for image_info in page_info['images']: + img_bbox = image_info['bbox'] + if header and img_bbox[3]<=header[3]: + image_info['tag'] = "header" + header_image.append(image_info) + elif footer and img_bbox[1]>=footer[1]: + image_info['tag'] = "footer" + footer_image.append(image_info) + + page_info['droped_image_block'].extend(header_image) + page_info['droped_image_block'].extend(footer_image) + + for img in header_image: + page_info['images'].remove(img) + for img in footer_image: + page_info['images'].remove(img) + + """接下来吧backup的图片也删除掉""" + header_image = [] + footer_image = [] + + for image_info in page_info['image_backup']: + img_bbox = image_info['bbox'] + if header and img_bbox[3]<=header[3]: + image_info['tag'] = "header" + header_image.append(image_info) + elif footer and img_bbox[1]>=footer[1]: + image_info['tag'] = "footer" + footer_image.append(image_info) + + page_info['droped_image_block'].extend(header_image) + page_info['droped_image_block'].extend(footer_image) + + for img in header_image: + page_info['image_backup'].remove(img) + for img in footer_image: + page_info['image_backup'].remove(img) + + return header, footer diff --git a/magic_pdf/pre_proc/detect_footnote.py b/magic_pdf/pre_proc/detect_footnote.py new file mode 100644 index 0000000000000000000000000000000000000000..4f903c85582dd37b6ec8a3efc4165f39eaac58ee --- 
from collections import Counter

from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.coordinate_transform import get_scale_ratio


def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict,
                             md_bookname_save_path=None, debug_mode=False):
    """
    Collect footnote bounding boxes detected by the DocXChain layout model.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :param md_bookname_save_path: directory used only in debug mode for footnote crops
        (the actual crop saving is currently disabled).
    :param debug_mode: when True, per-footnote image names are generated.
    :return: list of (L, U, R, D) footnote bboxes sorted by (top, left).
    """
    footnote_bbox_from_DocXChain = []

    xf_json = json_from_DocXchain_obj
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    for xf in xf_json['layout_dets']:
        L = xf['poly'][0] / horizontal_scale_ratio
        U = xf['poly'][1] / vertical_scale_ratio
        R = xf['poly'][2] / horizontal_scale_ratio
        D = xf['poly'][5] / vertical_scale_ratio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        # 0.43 is the tuned footnote confidence threshold (raised from 0.3).
        if xf['category_id'] == 5 and xf['score'] >= 0.43:
            footnote_bbox_from_DocXChain.append((L, U, R, D))

    footnote_final_names = []
    footnote_final_bboxs = []
    footnote_ID = 0
    for L, U, R, D in footnote_bbox_from_DocXChain:
        if debug_mode:
            # cur_footnote = page.get_pixmap(clip=(L, U, R, D))
            new_footnote_name = "footnote_{}_{}.png".format(page_ID, footnote_ID)
            # cur_footnote.save(md_bookname_save_path + '/' + new_footnote_name)
            footnote_final_names.append(new_footnote_name)
        footnote_final_bboxs.append((L, U, R, D))
        footnote_ID += 1

    footnote_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    curPage_all_footnote_bboxs = footnote_final_bboxs
    return curPage_all_footnote_bboxs


def need_remove(block):
    """
    Heuristic filter: return True if a candidate text block should NOT be
    treated as a footnote (e.g. headings or a "Keywords" line).
    """
    if 'lines' in block and len(block['lines']) > 0:
        # A single-line block whose only span is all upper-case, or is set in a
        # bold font variant ('SB', 'bold', 'Bold'), is likely a heading.
        if len(block['lines']) == 1:
            if 'spans' in block['lines'][0] and len(block['lines'][0]['spans']) == 1:
                font_keywords = ['SB', 'bold', 'Bold']
                only_span = block['lines'][0]['spans'][0]
                if only_span['text'].isupper() or any(
                        keyword in only_span['font'] for keyword in font_keywords):
                    return True
        for line in block['lines']:
            if 'spans' in line and len(line['spans']) > 0:
                for span in line['spans']:
                    # Case-insensitive check for "keyword" (e.g. a "Keywords:"
                    # line), which is not a footnote.
                    if "keyword" in span['text'].lower():
                        return True
    return False


def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_font):
    """
    Rule-based footnote detection: find text blocks in the lower part of the
    page whose font size is smaller than the dominant body-text size.

    Args:
        remain_text_blocks (list): text blocks still unassigned on this page.
        page_height (float): page height in points.
        page_id (int): 0-based page index; only pages 0-2 are processed.
        main_text_font (str): dominant body-text font name, computed upstream.

    Returns:
        list: bboxes of blocks judged to be footnotes.
    """
    if page_id > 2:  # for precision, only scan the first 3 pages
        return []

    line_sizes = []    # average font size of every line on the page
    block_sizes = []   # (block, avg line size, dominant font) per block
    font_names = Counter()  # page-wide font usage weighted by char count
    if len(remain_text_blocks) > 0:
        for block in remain_text_blocks:
            block_line_sizes = []
            block_fonts = Counter()
            for line in block['lines']:
                span_sizes = [span['size'] for span in line['spans'] if 'size' in span]
                if span_sizes:
                    line_size = sum(span_sizes) / len(span_sizes)
                    line_sizes.append(line_size)
                    block_line_sizes.append(line_size)
                span_font = [(span['font'], len(span['text']))
                             for span in line['spans']
                             if 'font' in span and len(span['text']) > 0]
                if span_font:
                    # Weight fonts by character count (not per-span), so the
                    # dominant font reflects the amount of text it covers.
                    for font, count in span_font:
                        font_names[font] += count
                        block_fonts[font] += count
            if block_line_sizes:
                block_size = sum(block_line_sizes) / len(block_line_sizes)
                # BUGFIX: block_fonts can be empty (spans with sizes but no
                # usable font/text); most_common(1)[0] would raise IndexError.
                block_font = block_fonts.most_common(1)[0][0] if block_fonts else None
                block_sizes.append((block, block_size, block_font))

        # BUGFIX: line_sizes can be empty (no span carries a 'size'), which
        # previously crashed on most_common(1)[0]; there is nothing to detect.
        if not line_sizes:
            return []
        # The most common line size is taken as the body-text size.
        main_text_size = Counter(line_sizes).most_common(1)[0][0]

        # Drop blocks that are likely misidentified as footnotes.
        block_sizes = [(block, block_size, block_font)
                       for block, block_size, block_font in block_sizes
                       if not need_remove(block)]

        # Strict rule: lower 40% of the page, smaller than body text, and
        # either short (< 5 lines) or set in a non-body font.
        footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes
                           if block['bbox'][1] > page_height * 0.6 and
                           block_size < main_text_size and
                           (len(block['lines']) < 5 or
                            block_font != main_text_font)]

        return footnote_bboxes
    else:
        return []
from magic_pdf.libs.commons import fitz  # PyMuPDF
from magic_pdf.libs.coordinate_transform import get_scale_ratio


def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect page-header bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) header bboxes sorted by (top, left).
    """
    xf_json = json_from_DocXchain_obj
    h_ratio, v_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    header_bbox_from_DocXChain = []
    for det in xf_json['layout_dets']:
        x0 = det['poly'][0] / h_ratio
        y0 = det['poly'][1] / v_ratio
        x1 = det['poly'][2] / h_ratio
        y1 = det['poly'][5] / v_ratio
        left, right = min(x0, x1), max(x0, x1)
        top, bottom = min(y0, y1), max(y0, y1)
        # category 3 == page header, 0.3 confidence threshold
        if det['category_id'] == 3 and det['score'] >= 0.3:
            header_bbox_from_DocXChain.append((left, top, right, bottom))

    # Crop saving is disabled; names are generated only for parity with the
    # other detect_* helpers.
    header_final_names = []
    header_final_bboxs = []
    for header_ID, bbox in enumerate(header_bbox_from_DocXChain):
        # cur_header = page.get_pixmap(clip=bbox)
        header_final_names.append("header_{}_{}.png".format(page_ID, header_ID))
        # cur_header.save(res_dir_path + '/' + new_header_name)
        header_final_bboxs.append(bbox)

    header_final_bboxs.sort(key=lambda bbox: (bbox[1], bbox[0]))
    curPage_all_header_bboxs = header_final_bboxs
    return curPage_all_header_bboxs


import collections  # deque / counting helpers for the svg flood fill below
import re
from magic_pdf.libs.commons import fitz  # PyMuPDF


# --------------------------------- Tool Functions --------------------------------- #
def remove_special_chars(s: str) -> str:
    """Strip every character that is not a-z, A-Z or 0-9."""
    return re.sub(r"[^a-zA-Z0-9]", "", s)


def check_rect1_sameWith_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if the two rectangles are exactly identical."""
    return (L1, U1, R1, D1) == (L2, U2, R2, D2)


def check_rect1_contains_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if rect1 fully contains rect2 (shared edges allowed)."""
    horizontally_inside = L1 <= L2 <= R2 <= R1
    vertically_inside = U1 <= U2 <= D2 <= D1
    return horizontally_inside and vertically_inside


def check_rect1_overlaps_rect2(L1: float, U1: float, R1: float, D1: float,
                               L2: float, U2: float, R2: float, D2: float) -> bool:
    """True if rect1 and rect2 intersect; a shared edge counts as overlap."""
    return max(L1, L2) <= min(R1, R2) and max(U1, U2) <= min(D1, D2)


def calculate_overlapRatio_between_rect1_and_rect2(L1: float, U1: float, R1: float, D1: float,
                                                   L2: float, U2: float, R2: float, D2: float) -> (float, float):
    """
    Overlap area of the two rects as a fraction of each rect's own area.
    Returns (0, 0) when the rects are disjoint or either one is degenerate.
    """
    if min(R1, R2) < max(L1, L2) or min(D1, D2) < max(U1, U2):
        return 0, 0
    area_1 = (R1 - L1) * (D1 - U1)
    area_2 = (R2 - L2) * (D2 - U2)
    if area_1 == 0 or area_2 == 0:
        return 0, 0
    overlap_area = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2))
    return overlap_area / area_1, overlap_area / area_2
return 0, 0 + square_overlap = (min(R1, R2) - max(L1, L2)) * (min(D1, D2) - max(U1, U2)) + return square_overlap / square_1, square_overlap / square_2 + +def calculate_overlapRatio_between_line1_and_line2(L1: float, R1: float, L2: float, R2: float) -> (float, float): + # 计算两个line,重叠区间各占2个line长度的比例 + if max(L1, L2) > min(R1, R2): + return 0, 0 + if L1 == R1 or L2 == R2: + return 0, 0 + overlap_line = min(R1, R2) - max(L1, L2) + return overlap_line / (R1 - L1), overlap_line / (R2 - L2) + + +# 判断rect其实是一条line +def check_rect_isLine(L: float, U: float, R: float, D: float) -> bool: + width = R - L + height = D - U + if width <= 3 or height <= 3: + return True + if width / height >= 30 or height / width >= 30: + return True + + + +def parse_images(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, junk_img_bojids=[]): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict + """ + #### 通过fitz获取page信息 + ## 超越边界 + DPI = 72 # use this resolution + pix = page.get_pixmap(dpi=DPI) + pageL = 0 + pageR = int(pix.w) + pageU = 0 + pageD = int(pix.h) + + #----------------- 保存每一个文本块的LURD ------------------# + textLine_blocks = [] + blocks = page.get_text( + "dict", + flags=fitz.TEXTFLAGS_TEXT, + #clip=clip, + )["blocks"] + for i in range(len(blocks)): + bbox = blocks[i]['bbox'] + # print(bbox) + for tt in blocks[i]['lines']: + # 当前line + cur_line_bbox = None # 当前line,最右侧的section的bbox + for xf in tt['spans']: + L, U, R, D = xf['bbox'] + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + textLine_blocks.append((L, U, R, D)) + textLine_blocks.sort(key = lambda LURD: (LURD[1], LURD[0])) + + + #---------------------------------------------- 保存img --------------------------------------------------# + raw_imgs = 
page.get_images() # 获取所有的图片 + imgs = [] + img_names = [] # 保存图片的名字,方便在md中插入引用 + img_bboxs = [] # 保存图片的location信息。 + img_visited = [] # 记忆化,记录该图片是否在md中已经插入过了 + img_ID = 0 + + ## 获取、保存每张img的location信息(x1, y1, x2, y2, UL, DR坐标) + for i in range(len(raw_imgs)): + # 如果图片在junklist中则跳过 + if raw_imgs[i][0] in junk_img_bojids: + continue + else: + try: + tt = page.get_image_rects(raw_imgs[i][0], transform = True) + + rec = tt[0][0] + L, U, R, D = int(rec[0]), int(rec[1]), int(rec[2]), int(rec[3]) + + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + if not(pageL <= L < R <= pageR and pageU <= U < D <= pageD): + continue + if pageL == L and R == pageR: + continue + if pageU == U and D == pageD: + continue + # pix1 = page.get_Pixmap(clip=(L,U,R,D)) + new_img_name = "{}_{}.png".format(page_ID, i) # 图片name + # pix1.save(res_dir_path + '/' + new_img_name) # 把图片存出在新建的文件夹,并命名 + img_names.append(new_img_name) + img_bboxs.append((L, U, R, D)) + img_visited.append(False) + imgs.append(raw_imgs[i]) + except: + continue + + #-------- 如果img之间有重叠。说明获取的img大小有问题,位置也不一定对。就扔掉--------# + imgs_ok = [True for _ in range(len(imgs))] + for i in range(len(imgs)): + L1, U1, R1, D1 = img_bboxs[i] + for j in range(i + 1, len(imgs)): + L2, U2, R2, D2 = img_bboxs[j] + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + if ratio_1 > 0 and ratio_2 > 0: + if ratio_1 == 1 and ratio_2 > 0.8: + imgs_ok[i] = False + elif ratio_1 > 0.8 and ratio_2 == 1: + imgs_ok[j] = False + elif s1 > 20000 and s2 > 20000 and ratio_1 > 0.4 and ratio_2 > 0.4: + imgs_ok[i] = False + imgs_ok[j] = False + elif s1 / s2 > 5 and ratio_2 > 0.5: + imgs_ok[j] = False + elif s2 / s1 > 5 and ratio_1 > 0.5: + imgs_ok[i] = False + + imgs = [imgs[i] for i in range(len(imgs)) if imgs_ok[i] == True] + img_names = [img_names[i] for i in range(len(imgs)) if imgs_ok[i] == True] + img_bboxs = [img_bboxs[i] for i in 
range(len(imgs)) if imgs_ok[i] == True] + img_visited = [img_visited[i] for i in range(len(imgs)) if imgs_ok[i] == True] + #*******************************************************************************# + + #---------------------------------------- 通过fitz提取svg的信息 -----------------------------------------# + # + svgs = page.get_drawings() + #------------ preprocess, check一些大框,看是否是合理的 ----------# + ## 去重。有时候会遇到rect1和rect2是完全一样的情形。 + svg_rect_visited = set() + available_svgIdx = [] + for i in range(len(svgs)): + L, U, R, D = svgs[i]['rect'].irect + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + tt = (L, U, R, D) + if tt not in svg_rect_visited: + svg_rect_visited.add(tt) + available_svgIdx.append(i) + + svgs = [svgs[i] for i in available_svgIdx] # 去重后,有效的svgs + svg_childs = [[] for _ in range(len(svgs))] + svg_parents = [[] for _ in range(len(svgs))] + svg_overlaps = [[] for _ in range(len(svgs))] #svg_overlaps[i]是一个list,存的是与svg_i有重叠的svg的index。e.g., svg_overlaps[0] = [1, 2, 7, 9] + svg_visited = [False for _ in range(len(svgs))] + svg_exceedPage = [0 for _ in range(len(svgs))] # 是否超越边界(artbox),很大,但一般是一个svg的底。 + + + for i in range(len(svgs)): + L, U, R, D = svgs[i]['rect'].irect + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L, U, R, D, pageL, pageU, pageR, pageD) + if (pageL + 20 < L <= R < pageR - 20) and (pageU + 20 < U <= D < pageD - 20): + if ratio_2 >= 0.7: + svg_exceedPage[i] += 4 + else: + if L <= pageL: + svg_exceedPage[i] += 1 + if pageR <= R: + svg_exceedPage[i] += 1 + if U <= pageU: + svg_exceedPage[i] += 1 + if pageD <= D: + svg_exceedPage[i] += 1 + + #### 如果有≥2个的超边界的框,就不要手写规则判断svg了。很难写对。 + if len([x for x in svg_exceedPage if x >= 1]) >= 2: + svgs = [] + svg_childs = [] + svg_parents = [] + svg_overlaps = [] + svg_visited = [] + svg_exceedPage = [] + + #---------------------------- build graph ----------------------------# + for i, p in enumerate(svgs): + L1, U1, R1, D1 = svgs[i]["rect"].irect + for j in 
range(len(svgs)): + if i == j: + continue + L2, U2, R2, D2 = svgs[j]["rect"].irect + ## 包含 + if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_childs[i].append(j) + svg_parents[j].append(i) + else: + ## 交叉 + if check_rect1_overlaps_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_overlaps[i].append(j) + + #---------------- 确定最终的svg。连通块儿的外围 -------------------# + eps_ERROR = 5 # 给识别出的svg,四周留白(为了防止pyMuPDF的rect不准) + svg_ID = 0 + svg_final_names = [] + svg_final_bboxs = [] + svg_final_visited = [] # 为下面,text识别左准备。作用同img_visited + + svg_idxs = [i for i in range(len(svgs))] + svg_idxs.sort(key = lambda i: -(svgs[i]['rect'].irect[2] - svgs[i]['rect'].irect[0]) * (svgs[i]['rect'].irect[3] - svgs[i]['rect'].irect[1])) # 按照面积,从大到小排序 + + for i in svg_idxs: + if svg_visited[i] == True: + continue + svg_visited[i] = True + L, U, R, D = svgs[i]['rect'].irect + width = R - L + height = D - U + if check_rect_isLine(L, U, R, D) == True: + svg_visited[i] = False + continue + # if i == 4: + # print(i, L, U, R, D) + # print(svg_parents[i]) + + cur_block_element_cnt = 0 # 当前要判定为svg的区域中,有多少elements,最外围的最大svg框除外。 + if len(svg_parents[i]) == 0: + ## 是个普通框的情形 + cur_block_element_cnt += len(svg_childs[i]) + if svg_exceedPage[i] == 0: + ## 误差。可能已经包含在某个框里面了 + neglect_flag = False + for pL, pU, pR, pD in svg_final_bboxs: + if pL <= L <= R <= pR and pU <= U <= D <= pD: + neglect_flag = True + break + if neglect_flag == True: + continue + + ## 搜索连通域, bfs+记忆化 + q = collections.deque() + for j in svg_overlaps[i]: + q.append(j) + while q: + j = q.popleft() + svg_visited[j] = True + L2, U2, R2, D2 = svgs[j]['rect'].irect + # width2 = R2 - L2 + # height2 = D2 - U2 + # if width2 <= 2 or height2 <= 2 or (height2 / width2) >= 30 or (width2 / height2) >= 30: + # continue + L = min(L, L2) + R = max(R, R2) + U = min(U, U2) + D = max(D, D2) + cur_block_element_cnt += 1 + cur_block_element_cnt += len(svg_childs[j]) + for k in svg_overlaps[j]: + if svg_visited[k] == False and 
svg_exceedPage[k] == 0: + svg_visited[k] = True + q.append(k) + elif svg_exceedPage[i] <= 2: + ## 误差。可能已经包含在某个svg_final_bbox框里面了 + neglect_flag = False + for sL, sU, sR, sD in svg_final_bboxs: + if sL <= L <= R <= sR and sU <= U <= D <= sD: + neglect_flag = True + break + if neglect_flag == True: + continue + + L, U, R, D = pageR, pageD, pageL, pageU + ## 所有孩子元素的最大边界 + for j in svg_childs[i]: + if svg_visited[j] == True: + continue + if svg_exceedPage[j] >= 1: + continue + svg_visited[j] = True #### 这个位置考虑一下 + L2, U2, R2, D2 = svgs[j]['rect'].irect + L = min(L, L2) + R = max(R, R2) + U = min(U, U2) + D = max(D, D2) + cur_block_element_cnt += 1 + + # 如果是条line,就不用保存了 + if check_rect_isLine(L, U, R, D) == True: + continue + # 如果当前的svg,连2个elements都没有,就不用保存了 + if cur_block_element_cnt < 3: + continue + + ## 当前svg,框住了多少文本框。如果框多了,可能就是错了 + contain_textLineBlock_cnt = 0 + for L2, U2, R2, D2 in textLine_blocks: + if check_rect1_contains_rect2(L, U, R, D, L2, U2, R2, D2) == True: + contain_textLineBlock_cnt += 1 + if contain_textLineBlock_cnt >= 10: + continue + + # L -= eps_ERROR * 2 + # U -= eps_ERROR + # R += eps_ERROR * 2 + # D += eps_ERROR + # # cur_svg = page.get_pixmap(matrix=fitz.Identity, dpi=None, colorspace=fitz.csRGB, clip=(U,L,R,D), alpha=False, annots=True) + # cur_svg = page.get_pixmap(clip=(L,U,R,D)) + new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID) # 图片name + # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 + svg_final_names.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 + svg_final_bboxs.append((L, U, R, D)) + svg_final_visited.append(False) + svg_ID += 1 + + ## 识别出的svg,可能有 包含,相邻的情形。需要进一步合并 + svg_idxs = [i for i in range(len(svg_final_bboxs))] + svg_idxs.sort(key = lambda i: (svg_final_bboxs[i][1], svg_final_bboxs[i][0])) # (U, L) + svg_final_names_2 = [] + svg_final_bboxs_2 = [] + svg_final_visited_2 = [] # 为下面,text识别左准备。作用同img_visited + svg_ID_2 = 0 + for i in range(len(svg_final_bboxs)): + L1, U1, R1, D1 = svg_final_bboxs[i] + 
for j in range(i + 1, len(svg_final_bboxs)): + L2, U2, R2, D2 = svg_final_bboxs[j] + # 如果 rect1包含了rect2 + if check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + svg_final_visited[j] = True + continue + # 水平并列 + ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(U1, D1, U2, D2) + if ratio_1 >= 0.7 and ratio_2 >= 0.7: + if abs(L2 - R1) >= 20: + continue + LL = min(L1, L2) + UU = min(U1, U2) + RR = max(R1, R2) + DD = max(D1, D2) + svg_final_bboxs[i] = (LL, UU, RR, DD) + svg_final_visited[j] = True + continue + # 竖直并列 + ratio_1, ratio_2 = calculate_overlapRatio_between_line1_and_line2(L1, R2, L2, R2) + if ratio_1 >= 0.7 and ratio_2 >= 0.7: + if abs(U2 - D1) >= 20: + continue + LL = min(L1, L2) + UU = min(U1, U2) + RR = max(R1, R2) + DD = max(D1, D2) + svg_final_bboxs[i] = (LL, UU, RR, DD) + svg_final_visited[j] = True + + for i in range(len(svg_final_bboxs)): + if svg_final_visited[i] == False: + L, U, R, D = svg_final_bboxs[i] + svg_final_bboxs_2.append((L, U, R, D)) + + L -= eps_ERROR * 2 + U -= eps_ERROR + R += eps_ERROR * 2 + D += eps_ERROR + # cur_svg = page.get_pixmap(clip=(L,U,R,D)) + new_svg_name = "svg_{}_{}.png".format(page_ID, svg_ID_2) # 图片name + # cur_svg.save(res_dir_path + '/' + new_svg_name) # 把图片存出在新建的文件夹,并命名 + svg_final_names_2.append(new_svg_name) # 把图片的名字存在list中,方便在md中插入引用 + svg_final_bboxs_2.append((L, U, R, D)) + svg_final_visited_2.append(False) + svg_ID_2 += 1 + + ## svg收尾。识别为drawing,但是在上面没有拼成一张图的。 + # 有收尾才comprehensive + # xxxx + # xxxx + # xxxx + # xxxx + + + #--------- 通过json_from_DocXchain来获取,figure, table, equation的bbox ---------# + figure_bbox_from_DocXChain = [] + + figure_from_DocXChain_visited = [] # 记忆化 + figure_bbox_from_DocXChain_overlappedRatio = [] + + figure_only_from_DocXChain_bboxs = [] # 存储 + figure_only_from_DocXChain_names = [] + figure_only_from_DocXChain_visited = [] + figure_only_ID = 0 + + xf_json = json_from_DocXchain_obj + width_from_json = xf_json['page_info']['width'] + 
height_from_json = xf_json['page_info']['height'] + LR_scaleRatio = width_from_json / (pageR - pageL) + UD_scaleRatio = height_from_json / (pageD - pageU) + + for xf in xf_json['layout_dets']: + # {0: 'title', 1: 'figure', 2: 'plain text', 3: 'header', 4: 'page number', 5: 'footnote', 6: 'footer', 7: 'table', 8: 'table caption', 9: 'figure caption', 10: 'equation', 11: 'full column', 12: 'sub column'} + L = xf['poly'][0] / LR_scaleRatio + U = xf['poly'][1] / UD_scaleRatio + R = xf['poly'][2] / LR_scaleRatio + D = xf['poly'][5] / UD_scaleRatio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + # figure + if xf["category_id"] == 1 and xf['score'] >= 0.3: + figure_bbox_from_DocXChain.append((L, U, R, D)) + figure_from_DocXChain_visited.append(False) + figure_bbox_from_DocXChain_overlappedRatio.append(0.0) + + #---------------------- 比对上面识别出来的img,svg 与DocXChain给的figure -----------------------# + + ## 比对imgs + for i, b1 in enumerate(figure_bbox_from_DocXChain): + # print('--------- DocXChain的图片', b1) + L1, U1, R1, D1 = b1 + for b2 in img_bboxs: + # print('-------- igms得到的图', b2) + L2, U2, R2, D2 = b2 + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + # 相同 + if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + # 包含 + elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + if s2 / s1 > 0.8: + figure_from_DocXChain_visited[i] = True + elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + if s1 / s2 > 0.8: + figure_from_DocXChain_visited[i] = True + else: + # 重叠了相当一部分 + # print('进入第3部分') + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if (ratio_1 >= 0.6 and ratio_2 >= 0.6) or (ratio_1 >= 0.8 and s1/s2>0.8) or (ratio_2 >= 0.8 and s2/s1>0.8): + figure_from_DocXChain_visited[i] = True + else: + 
figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 + # print('图片的重叠率是{}'.format(ratio_1)) + + + ## 比对svgs + svg_final_bboxs_2_badIdxs = [] + for i, b1 in enumerate(figure_bbox_from_DocXChain): + L1, U1, R1, D1 = b1 + for j, b2 in enumerate(svg_final_bboxs_2): + L2, U2, R2, D2 = b2 + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + # 相同 + if check_rect1_sameWith_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + # 包含 + elif check_rect1_contains_rect2(L1, U1, R1, D1, L2, U2, R2, D2) == True: + figure_from_DocXChain_visited[i] = True + elif check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + if s1 / s2 > 0.7: + figure_from_DocXChain_visited[i] = True + else: + svg_final_bboxs_2_badIdxs.append(j) # svg丢弃。用DocXChain的结果。 + else: + # 重叠了相当一部分 + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if (ratio_1 >= 0.5 and ratio_2 >= 0.5) or (min(ratio_1, ratio_2) >= 0.4 and max(ratio_1, ratio_2) >= 0.6): + figure_from_DocXChain_visited[i] = True + else: + figure_bbox_from_DocXChain_overlappedRatio[i] += ratio_1 + + # 丢掉错误的svg + svg_final_bboxs_2 = [svg_final_bboxs_2[i] for i in range(len(svg_final_bboxs_2)) if i not in set(svg_final_bboxs_2_badIdxs)] + + for i in range(len(figure_from_DocXChain_visited)): + if figure_bbox_from_DocXChain_overlappedRatio[i] >= 0.7: + figure_from_DocXChain_visited[i] = True + + # DocXChain识别出来的figure,但是没被保存的。 + for i in range(len(figure_from_DocXChain_visited)): + if figure_from_DocXChain_visited[i] == False: + figure_from_DocXChain_visited[i] = True + cur_bbox = figure_bbox_from_DocXChain[i] + # cur_figure = page.get_pixmap(clip=cur_bbox) + new_figure_name = "figure_only_{}_{}.png".format(page_ID, figure_only_ID) # 图片name + # cur_figure.save(res_dir_path + '/' + new_figure_name) # 把图片存出在新建的文件夹,并命名 + figure_only_from_DocXChain_names.append(new_figure_name) # 把图片的名字存在list中,方便在md中插入引用 + 
figure_only_from_DocXChain_bboxs.append(cur_bbox) + figure_only_from_DocXChain_visited.append(False) + figure_only_ID += 1 + + img_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + svg_final_bboxs_2.sort(key = lambda LURD: (LURD[1], LURD[0])) + figure_only_from_DocXChain_bboxs.sort(key = lambda LURD: (LURD[1], LURD[0])) + curPage_all_fig_bboxs = img_bboxs + svg_final_bboxs + figure_only_from_DocXChain_bboxs + + #--------------------------- 最后统一去重 -----------------------------------# + curPage_all_fig_bboxs.sort(key = lambda LURD: ( (LURD[2]-LURD[0])*(LURD[3]-LURD[1]) , LURD[0], LURD[1]) ) + + #### 先考虑包含关系的小块 + final_duplicate = set() + for i in range(len(curPage_all_fig_bboxs)): + L1, U1, R1, D1 = curPage_all_fig_bboxs[i] + for j in range(len(curPage_all_fig_bboxs)): + if i == j: + continue + L2, U2, R2, D2 = curPage_all_fig_bboxs[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + if check_rect1_contains_rect2(L2, U2, R2, D2, L1, U1, R1, D1) == True: + final_duplicate.add((L1, U1, R1, D1)) + else: + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + if ratio_1 >= 0.8 and ratio_2 <= 0.6: + final_duplicate.add((L1, U1, R1, D1)) + + curPage_all_fig_bboxs = [LURD for LURD in curPage_all_fig_bboxs if LURD not in final_duplicate] + + #### 再考虑重叠关系的块 + final_duplicate = set() + final_synthetic_bboxs = [] + for i in range(len(curPage_all_fig_bboxs)): + L1, U1, R1, D1 = curPage_all_fig_bboxs[i] + for j in range(len(curPage_all_fig_bboxs)): + if i == j: + continue + L2, U2, R2, D2 = curPage_all_fig_bboxs[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + union_ok = False + if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + union_ok = True + if (ratio_1 > 0.2 and s2 / s1 > 5): + union_ok = True + if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + 
union_ok = True + if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + union_ok = True + if union_ok == True: + final_duplicate.add((L1, U1, R1, D1)) + final_duplicate.add((L2, U2, R2, D2)) + L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) + final_synthetic_bboxs.append((L3, U3, R3, D3)) + + # print('---------- curPage_all_fig_bboxs ---------') + # print(curPage_all_fig_bboxs) + curPage_all_fig_bboxs = [b for b in curPage_all_fig_bboxs if b not in final_duplicate] + final_synthetic_bboxs = list(set(final_synthetic_bboxs)) + + + ## 再再考虑重叠关系。极端情况下会迭代式地2进1 + new_images = [] + droped_img_idx = [] + image_bboxes = [[b[0], b[1], b[2], b[3]] for b in final_synthetic_bboxs] + for i in range(0, len(image_bboxes)): + for j in range(i+1, len(image_bboxes)): + if j not in droped_img_idx: + L2, U2, R2, D2 = image_bboxes[j] + s1 = abs(R1 - L1) * abs(D1 - U1) + s2 = abs(R2 - L2) * abs(D2 - U2) + ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + union_ok = False + if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + union_ok = True + if (ratio_1 > 0.2 and s2 / s1 > 5): + union_ok = True + if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + union_ok = True + if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + union_ok = True + if union_ok == True: + # 合并 + image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3]) + droped_img_idx.append(j) + + for i in range(0, len(image_bboxes)): + if i not in droped_img_idx: + new_images.append(image_bboxes[i]) + + + # find_union_FLAG = True + # while find_union_FLAG == True: + # find_union_FLAG = False + # final_duplicate = set() + # tmp = [] + # for i in range(len(final_synthetic_bboxs)): + # L1, U1, R1, D1 = final_synthetic_bboxs[i] + # 
for j in range(len(final_synthetic_bboxs)): + # if i == j: + # continue + # L2, U2, R2, D2 = final_synthetic_bboxs[j] + # s1 = abs(R1 - L1) * abs(D1 - U1) + # s2 = abs(R2 - L2) * abs(D2 - U2) + # ratio_1, ratio_2 = calculate_overlapRatio_between_rect1_and_rect2(L1, U1, R1, D1, L2, U2, R2, D2) + # union_ok = False + # if (ratio_1 >= 0.8 and ratio_2 <= 0.6) or (ratio_1 > 0.6 and ratio_2 > 0.6): + # union_ok = True + # if (ratio_1 > 0.2 and s2 / s1 > 5): + # union_ok = True + # if (L1 <= (L2+R2)/2 <= R1) and (U1 <= (U2+D2)/2 <= D1): + # union_ok = True + # if (L2 <= (L1+R1)/2 <= R2) and (U2 <= (U1+D1)/2 <= D2): + # union_ok = True + # if union_ok == True: + # find_union_FLAG = True + # final_duplicate.add((L1, U1, R1, D1)) + # final_duplicate.add((L2, U2, R2, D2)) + # L3, U3, R3, D3 = min(L1, L2), min(U1, U2), max(R1, R2), max(D1, D2) + # tmp.append((L3, U3, R3, D3)) + # if find_union_FLAG == True: + # tmp = list(set(tmp)) + # final_synthetic_bboxs = tmp[:] + + + # curPage_all_fig_bboxs += final_synthetic_bboxs + # print('--------- final synthetic') + # print(final_synthetic_bboxs) + #**************************************************************************# + images1 = [[img[0], img[1], img[2], img[3]] for img in curPage_all_fig_bboxs] + images = images1 + new_images + return images + diff --git a/magic_pdf/pre_proc/detect_page_number.py b/magic_pdf/pre_proc/detect_page_number.py new file mode 100644 index 0000000000000000000000000000000000000000..35920a99b00222cc93e9566ef131a6553a3cd38b --- /dev/null +++ b/magic_pdf/pre_proc/detect_page_number.py @@ -0,0 +1,64 @@ +from magic_pdf.libs.commons import fitz # pyMuPDF库 +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict): + """ + :param page_ID: int类型,当前page在当前pdf文档中是第page_D页。 + :param page :fitz读取的当前页的内容 + :param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir + :param 
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect page-number bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (used for coordinate scaling).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) page-number bboxes sorted by (top, left).
    """
    xf_json = json_from_DocXchain_obj
    h_ratio, v_ratio = get_scale_ratio(xf_json, page)

    # DocXChain category ids:
    # 0 title, 1 figure, 2 plain text, 3 header, 4 page number, 5 footnote,
    # 6 footer, 7 table, 8 table caption, 9 figure caption, 10 equation,
    # 11 full column, 12 sub column, 13 embedded equation, 14 isolated equation
    pageNo_bbox_from_DocXChain = []
    for det in xf_json['layout_dets']:
        x0 = det['poly'][0] / h_ratio
        y0 = det['poly'][1] / v_ratio
        x1 = det['poly'][2] / h_ratio
        y1 = det['poly'][5] / v_ratio
        left, right = min(x0, x1), max(x0, x1)
        top, bottom = min(y0, y1), max(y0, y1)
        # category 4 == page number, 0.3 confidence threshold
        if det['category_id'] == 4 and det['score'] >= 0.3:
            pageNo_bbox_from_DocXChain.append((left, top, right, bottom))

    # Crop saving is disabled; names are generated only for parity with the
    # other detect_* helpers.
    pageNo_final_names = []
    pageNo_final_bboxs = []
    for pageNo_ID, bbox in enumerate(pageNo_bbox_from_DocXChain):
        # cur_pageNo = page.get_pixmap(clip=bbox)
        pageNo_final_names.append("pageNo_{}_{}.png".format(page_ID, pageNo_ID))
        # cur_pageNo.save(res_dir_path + '/' + new_pageNo_name)
        pageNo_final_bboxs.append(bbox)

    pageNo_final_bboxs.sort(key=lambda bbox: (bbox[1], bbox[0]))
    curPage_all_pageNo_bboxs = pageNo_final_bboxs
    return curPage_all_pageNo_bboxs
from magic_pdf.libs.commons import fitz  # PyMuPDF


def parse_tables(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
    """
    Collect table bounding boxes from a DocXChain layout result.

    :param page_ID: 0-based index of this page within the current PDF.
    :param page: the PyMuPDF page object (rendered once to get the page size).
    :param json_from_DocXchain_obj: dict parsed from the DocXChain output JSON for this page.
    :return: list of (L, U, R, D) table bboxes sorted by (top, left).
    """
    # Page geometry at 72 DPI (PDF points).
    DPI = 72
    pix = page.get_pixmap(dpi=DPI)
    pageL = 0
    pageR = int(pix.w)
    pageU = 0
    pageD = int(pix.h)

    # --------- table bboxes reported by DocXChain --------- #
    table_bbox_from_DocXChain = []

    xf_json = json_from_DocXchain_obj
    width_from_json = xf_json['page_info']['width']
    height_from_json = xf_json['page_info']['height']
    LR_scaleRatio = width_from_json / (pageR - pageL)
    UD_scaleRatio = height_from_json / (pageD - pageU)

    for xf in xf_json['layout_dets']:
        # categories: 0 title, 1 figure, 2 plain text, 3 header, 4 page number,
        # 5 footnote, 6 footer, 7 table, 8 table caption, 9 figure caption,
        # 10 equation, 11 full column, 12 sub column,
        # 13 embedded equation, 14 isolated equation
        L = xf['poly'][0] / LR_scaleRatio
        U = xf['poly'][1] / UD_scaleRatio
        R = xf['poly'][2] / LR_scaleRatio
        D = xf['poly'][5] / UD_scaleRatio
        L, R = min(L, R), max(L, R)
        U, D = min(U, D), max(U, D)
        if xf['category_id'] == 7 and xf['score'] >= 0.3:
            table_bbox_from_DocXChain.append((L, U, R, D))

    # Crop saving is disabled; names kept for parity with the other detect_* helpers.
    table_final_names = []
    table_final_bboxs = []
    table_ID = 0
    for L, U, R, D in table_bbox_from_DocXChain:
        # cur_table = page.get_pixmap(clip=(L, U, R, D))
        new_table_name = "table_{}_{}.png".format(page_ID, table_ID)
        # cur_table.save(res_dir_path + '/' + new_table_name)
        table_final_names.append(new_table_name)
        table_final_bboxs.append((L, U, R, D))
        table_ID += 1

    table_final_bboxs.sort(key=lambda LURD: (LURD[1], LURD[0]))
    curPage_all_table_bboxs = table_final_bboxs
    return curPage_all_table_bboxs


"""
Replace equations inside the pymupdf structure with the recognition results
produced by the equation model.
"""

from magic_pdf.libs.commons import fitz
import json
import os
from pathlib import Path
from loguru import logger
from magic_pdf.libs.ocr_content_type import ContentType

TYPE_INLINE_EQUATION = ContentType.InlineEquation
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation


def combine_chars_to_pymudict(block_dict, char_dict):
    """
    Merge the char-level structure into the block-level pymupdf dict.

    block_dict has been cropped upstream, so blocks and lines are first
    aligned against char_dict by bbox before the 'chars' arrays are copied.
    """
    # Align the two structures by block bbox.
    char_map = {tuple(item["bbox"]): item for item in char_dict}

    for i in range(len(block_dict)):  # block level
        block = block_dict[i]
        key = block["bbox"]
        char_dict_item = char_map[tuple(key)]
        char_dict_map = {tuple(item["bbox"]): item for item in char_dict_item["lines"]}
        for j in range(len(block["lines"])):
            lines = block["lines"][j]
            with_char_lines = char_dict_map[lines["bbox"]]
            for k in range(len(lines["spans"])):
                spans = lines["spans"][k]
                try:
                    chars = with_char_lines["spans"][k]["chars"]
                except Exception:
                    logger.error(char_dict[i]["lines"][j])
                    # BUGFIX: the original fell through and assigned the
                    # unbound (or stale, from a previous iteration) local
                    # `chars`, raising NameError on the first failure;
                    # skip this span instead.
                    continue
                spans["chars"] = chars

    return block_dict


def calculate_overlap_area_2_minbox_area_ratio(bbox1, min_bbox):
    """
    Ratio of the intersection area of bbox1 and min_bbox to min_bbox's area.
    Returns 0 when the boxes do not intersect or min_bbox is degenerate.
    """
    # Coordinates of the intersection rectangle.
    x_left = max(bbox1[0], min_bbox[0])
    y_top = max(bbox1[1], min_bbox[1])
    x_right = min(bbox1[2], min_bbox[2])
    y_bottom = min(bbox1[3], min_bbox[3])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    intersection_area = (x_right - x_left) * (y_bottom - y_top)
    min_box_area = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0])
    if min_box_area == 0:
        return 0
    else:
        return intersection_area / min_box_area
+ if x_right < x_left or y_bottom < y_top: + return 0.0 + + # The area of overlap area + intersection_area = (x_right - x_left) * (y_bottom - y_top) + min_box_area = (min_bbox[3] - min_bbox[1]) * (min_bbox[2] - min_bbox[0]) + if min_box_area == 0: + return 0 + else: + return intersection_area / min_box_area + + +def _is_xin(bbox1, bbox2): + area1 = abs(bbox1[2] - bbox1[0]) * abs(bbox1[3] - bbox1[1]) + area2 = abs(bbox2[2] - bbox2[0]) * abs(bbox2[3] - bbox2[1]) + if area1 < area2: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox2, bbox1) + else: + ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) + + return ratio > 0.6 + + +def remove_text_block_in_interline_equation_bbox(interline_bboxes, text_blocks): + """消除掉整个块都在行间公式块内部的文本块""" + for eq_bbox in interline_bboxes: + removed_txt_blk = [] + for text_blk in text_blocks: + text_bbox = text_blk["bbox"] + if ( + calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], text_bbox) + >= 0.7 + ): + removed_txt_blk.append(text_blk) + for blk in removed_txt_blk: + text_blocks.remove(blk) + + return text_blocks + + +def _is_in_or_part_overlap(box1, box2) -> bool: + """ + 两个bbox是否有部分重叠或者包含 + """ + if box1 is None or box2 is None: + return False + + x0_1, y0_1, x1_1, y1_1 = box1 + x0_2, y0_2, x1_2, y1_2 = box2 + + return not ( + x1_1 < x0_2 # box1在box2的左边 + or x0_1 > x1_2 # box1在box2的右边 + or y1_1 < y0_2 # box1在box2的上边 + or y0_1 > y1_2 + ) # box1在box2的下边 + + +def remove_text_block_overlap_interline_equation_bbox( + interline_eq_bboxes, pymu_block_list +): + + """消除掉行行内公式有部分重叠的文本块的内容。 + 同时重新计算消除重叠之后文本块的大小""" + deleted_block = [] + for text_block in pymu_block_list: + deleted_line = [] + for line in text_block["lines"]: + deleted_span = [] + for span in line["spans"]: + deleted_chars = [] + for char in span["chars"]: + if any( + [ + (calculate_overlap_area_2_minbox_area_ratio(eq_bbox["bbox"], char["bbox"]) > 0.5) + for eq_bbox in interline_eq_bboxes + ] + ): + deleted_chars.append(char) + # 
检查span里没有char则删除这个span + for char in deleted_chars: + span["chars"].remove(char) + # 重新计算这个span的大小 + if len(span["chars"]) == 0: # 删除这个span + deleted_span.append(span) + else: + span["bbox"] = ( + min([b["bbox"][0] for b in span["chars"]]), + min([b["bbox"][1] for b in span["chars"]]), + max([b["bbox"][2] for b in span["chars"]]), + max([b["bbox"][3] for b in span["chars"]]), + ) + + # 检查这个span + for span in deleted_span: + line["spans"].remove(span) + if len(line["spans"]) == 0: # 删除这个line + deleted_line.append(line) + else: + line["bbox"] = ( + min([b["bbox"][0] for b in line["spans"]]), + min([b["bbox"][1] for b in line["spans"]]), + max([b["bbox"][2] for b in line["spans"]]), + max([b["bbox"][3] for b in line["spans"]]), + ) + + # 检查这个block是否可以删除 + for line in deleted_line: + text_block["lines"].remove(line) + if len(text_block["lines"]) == 0: # 删除block + deleted_block.append(text_block) + else: + text_block["bbox"] = ( + min([b["bbox"][0] for b in text_block["lines"]]), + min([b["bbox"][1] for b in text_block["lines"]]), + max([b["bbox"][2] for b in text_block["lines"]]), + max([b["bbox"][3] for b in text_block["lines"]]), + ) + + # 检查text block删除 + for block in deleted_block: + pymu_block_list.remove(block) + if len(pymu_block_list) == 0: + return [] + + return pymu_block_list + + +def insert_interline_equations_textblock(interline_eq_bboxes, pymu_block_list): + """在行间公式对应的地方插上一个伪造的block""" + for eq in interline_eq_bboxes: + bbox = eq["bbox"] + latex_content = eq["latex"] + text_block = { + "number": len(pymu_block_list), + "type": 0, + "bbox": bbox, + "lines": [ + { + "spans": [ + { + "size": 9.962599754333496, + "type": TYPE_INTERLINE_EQUATION, + "flags": 4, + "font": TYPE_INTERLINE_EQUATION, + "color": 0, + "ascender": 0.9409999847412109, + "descender": -0.3050000071525574, + "latex": latex_content, + "origin": [bbox[0], bbox[1]], + "bbox": bbox, + } + ], + "wmode": 0, + "dir": [1.0, 0.0], + "bbox": bbox, + } + ], + } + pymu_block_list.append(text_block) + 
+ +def x_overlap_ratio(box1, box2): + a, _, c, _ = box1 + e, _, g, _ = box2 + + # 计算重叠宽度 + overlap_x = max(min(c, g) - max(a, e), 0) + + # 计算box1的宽度 + width1 = g - e + + # 计算重叠比例 + overlap_ratio = overlap_x / width1 if width1 != 0 else 0 + + return overlap_ratio + + +def __is_x_dir_overlap(bbox1, bbox2): + return not (bbox1[2] < bbox2[0] or bbox1[0] > bbox2[2]) + + +def __y_overlap_ratio(box1, box2): + """""" + _, b, _, d = box1 + _, f, _, h = box2 + + # 计算重叠高度 + overlap_y = max(min(d, h) - max(b, f), 0) + + # 计算box1的高度 + height1 = d - b + + # 计算重叠比例 + overlap_ratio = overlap_y / height1 if height1 != 0 else 0 + + return overlap_ratio + + +def replace_line_v2(eqinfo, line): + """ + 扫描这一行所有的和公式框X方向重叠的char,然后计算char的左、右x0, x1,位于这个区间内的span删除掉。 + 最后与这个x0,x1有相交的span0, span1内部进行分割。 + """ + first_overlap_span = -1 + first_overlap_span_idx = -1 + last_overlap_span = -1 + delete_chars = [] + for i in range(0, len(line["spans"])): + if "chars" not in line["spans"][i]: + continue + + if line["spans"][i].get("_type", None) is not None: + continue # 忽略,因为已经是插入的伪造span公式了 + + for char in line["spans"][i]["chars"]: + if __is_x_dir_overlap(eqinfo["bbox"], char["bbox"]): + line_txt = "" + for span in line["spans"]: + span_txt = "" + for ch in span["chars"]: + span_txt = span_txt + ch["c"] + + span_txt = span_txt + "" + + line_txt = line_txt + span_txt + + if first_overlap_span_idx == -1: + first_overlap_span = line["spans"][i] + first_overlap_span_idx = i + last_overlap_span = line["spans"][i] + delete_chars.append(char) + + # 第一个和最后一个char要进行检查,到底属于公式多还是属于正常span多 + if len(delete_chars) > 0: + ch0_bbox = delete_chars[0]["bbox"] + if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51: + delete_chars.remove(delete_chars[0]) + if len(delete_chars) > 0: + ch0_bbox = delete_chars[-1]["bbox"] + if x_overlap_ratio(eqinfo["bbox"], ch0_bbox) < 0.51: + delete_chars.remove(delete_chars[-1]) + + # 计算x方向上被删除区间内的char的真实x0, x1 + if len(delete_chars): + x0, x1 = min([b["bbox"][0] for b in 
delete_chars]), max( + [b["bbox"][2] for b in delete_chars] + ) + else: + # logger.debug(f"行内公式替换没有发生,尝试下一行匹配, eqinfo={eqinfo}") + return False + + # 删除位于x0, x1这两个中间的span + delete_span = [] + for span in line["spans"]: + span_box = span["bbox"] + if x0 <= span_box[0] and span_box[2] <= x1: + delete_span.append(span) + for span in delete_span: + line["spans"].remove(span) + + equation_span = { + "size": 9.962599754333496, + "type": TYPE_INLINE_EQUATION, + "flags": 4, + "font": TYPE_INLINE_EQUATION, + "color": 0, + "ascender": 0.9409999847412109, + "descender": -0.3050000071525574, + "latex": "", + "origin": [337.1410153102337, 216.0205245153934], + "bbox": eqinfo["bbox"] + } + # equation_span = line['spans'][0].copy() + equation_span["latex"] = eqinfo['latex'] + equation_span["bbox"] = [x0, equation_span["bbox"][1], x1, equation_span["bbox"][3]] + equation_span["origin"] = [equation_span["bbox"][0], equation_span["bbox"][1]] + equation_span["chars"] = delete_chars + equation_span["type"] = TYPE_INLINE_EQUATION + equation_span["_eq_bbox"] = eqinfo["bbox"] + line["spans"].insert(first_overlap_span_idx + 1, equation_span) # 放入公式 + + # logger.info(f"==>text is 【{line_txt}】, equation is 【{eqinfo['latex_text']}】") + + # 第一个、和最后一个有overlap的span进行分割,然后插入对应的位置 + first_span_chars = [ + char + for char in first_overlap_span["chars"] + if (char["bbox"][2] + char["bbox"][0]) / 2 < x0 + ] + tail_span_chars = [ + char + for char in last_overlap_span["chars"] + if (char["bbox"][0] + char["bbox"][2]) / 2 > x1 + ] + + if len(first_span_chars) > 0: + first_overlap_span["chars"] = first_span_chars + first_overlap_span["text"] = "".join([char["c"] for char in first_span_chars]) + first_overlap_span["bbox"] = ( + first_overlap_span["bbox"][0], + first_overlap_span["bbox"][1], + max([chr["bbox"][2] for chr in first_span_chars]), + first_overlap_span["bbox"][3], + ) + # first_overlap_span['_type'] = "first" + else: + # 删掉 + if first_overlap_span not in delete_span: + 
line["spans"].remove(first_overlap_span) + + if len(tail_span_chars) > 0: + min_of_tail_span_x0 = min([chr["bbox"][0] for chr in tail_span_chars]) + min_of_tail_span_y0 = min([chr["bbox"][1] for chr in tail_span_chars]) + max_of_tail_span_x1 = max([chr["bbox"][2] for chr in tail_span_chars]) + max_of_tail_span_y1 = max([chr["bbox"][3] for chr in tail_span_chars]) + + if last_overlap_span == first_overlap_span: # 这个时候应该插入一个新的 + tail_span_txt = "".join([char["c"] for char in tail_span_chars]) + last_span_to_insert = last_overlap_span.copy() + last_span_to_insert["chars"] = tail_span_chars + last_span_to_insert["text"] = "".join( + [char["c"] for char in tail_span_chars] + ) + if equation_span["bbox"][2] >= last_overlap_span["bbox"][2]: + last_span_to_insert["bbox"] = ( + min_of_tail_span_x0, + min_of_tail_span_y0, + max_of_tail_span_x1, + max_of_tail_span_y1 + ) + else: + last_span_to_insert["bbox"] = ( + min([chr["bbox"][0] for chr in tail_span_chars]), + last_overlap_span["bbox"][1], + last_overlap_span["bbox"][2], + last_overlap_span["bbox"][3], + ) + # 插入到公式对象之后 + equation_idx = line["spans"].index(equation_span) + line["spans"].insert(equation_idx + 1, last_span_to_insert) # 放入公式 + else: # 直接修改原来的span + last_overlap_span["chars"] = tail_span_chars + last_overlap_span["text"] = "".join([char["c"] for char in tail_span_chars]) + last_overlap_span["bbox"] = ( + min([chr["bbox"][0] for chr in tail_span_chars]), + last_overlap_span["bbox"][1], + last_overlap_span["bbox"][2], + last_overlap_span["bbox"][3], + ) + else: + # 删掉 + if ( + last_overlap_span not in delete_span + and last_overlap_span != first_overlap_span + ): + line["spans"].remove(last_overlap_span) + + remain_txt = "" + for span in line["spans"]: + span_txt = "" + for char in span["chars"]: + span_txt = span_txt + char["c"] + + span_txt = span_txt + "" + + remain_txt = remain_txt + span_txt + + # logger.info(f"<== succ replace, text is 【{remain_txt}】, equation is 【{eqinfo['latex_text']}】") + + return 
True + + +def replace_eq_blk(eqinfo, text_block): + """替换行内公式""" + for line in text_block["lines"]: + line_bbox = line["bbox"] + if ( + _is_xin(eqinfo["bbox"], line_bbox) + or __y_overlap_ratio(eqinfo["bbox"], line_bbox) > 0.6 + ): # 定位到行, 使用y方向重合率是因为有的时候,一个行的宽度会小于公式位置宽度:行很高,公式很窄, + replace_succ = replace_line_v2(eqinfo, line) + if ( + not replace_succ + ): # 有的时候,一个pdf的line高度从API里会计算的有问题,因此在行内span级别会替换不成功,这就需要继续重试下一行 + continue + else: + break + else: + return False + return True + + +def replace_inline_equations(inline_equation_bboxes, raw_text_blocks): + """替换行内公式""" + for eqinfo in inline_equation_bboxes: + eqbox = eqinfo["bbox"] + for blk in raw_text_blocks: + if _is_xin(eqbox, blk["bbox"]): + if not replace_eq_blk(eqinfo, blk): + logger.warning(f"行内公式没有替换成功:{eqinfo} ") + else: + break + + return raw_text_blocks + + +def remove_chars_in_text_blocks(text_blocks): + """删除text_blocks里的char""" + for blk in text_blocks: + for line in blk["lines"]: + for span in line["spans"]: + _ = span.pop("chars", "no such key") + return text_blocks + + +def replace_equations_in_textblock( + raw_text_blocks, inline_equation_bboxes, interline_equation_bboxes +): + """ + 替换行间和和行内公式为latex + """ + raw_text_blocks = remove_text_block_in_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消除重叠:第一步,在公式内部的 + + raw_text_blocks = remove_text_block_overlap_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消重,第二步,和公式覆盖的 + + insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks) + raw_text_blocks = replace_inline_equations(inline_equation_bboxes, raw_text_blocks) + return raw_text_blocks + + +def draw_block_on_pdf_with_txt_replace_eq_bbox(json_path, pdf_path): + """ """ + new_pdf = f"{Path(pdf_path).parent}/{Path(pdf_path).stem}.step3-消除行内公式text_block.pdf" + with open(json_path, "r", encoding="utf-8") as f: + obj = json.loads(f.read()) + + if os.path.exists(new_pdf): + os.remove(new_pdf) + new_doc = fitz.open("") + + 
doc = fitz.open(pdf_path) + new_doc = fitz.open(pdf_path) + for i in range(len(new_doc)): + page = new_doc[i] + inline_equation_bboxes = obj[f"page_{i}"]["inline_equations"] + interline_equation_bboxes = obj[f"page_{i}"]["interline_equations"] + raw_text_blocks = obj[f"page_{i}"]["preproc_blocks"] + raw_text_blocks = remove_text_block_in_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消除重叠:第一步,在公式内部的 + raw_text_blocks = remove_text_block_overlap_interline_equation_bbox( + interline_equation_bboxes, raw_text_blocks + ) # 消重,第二步,和公式覆盖的 + insert_interline_equations_textblock(interline_equation_bboxes, raw_text_blocks) + raw_text_blocks = replace_inline_equations( + inline_equation_bboxes, raw_text_blocks + ) + + # 为了检验公式是否重复,把每一行里,含有公式的span背景改成黄色的 + color_map = [fitz.pdfcolor["blue"], fitz.pdfcolor["green"]] + j = 0 + for blk in raw_text_blocks: + for i, line in enumerate(blk["lines"]): + + # line_box = line['bbox'] + # shape = page.new_shape() + # shape.draw_rect(line_box) + # shape.finish(color=fitz.pdfcolor['red'], fill=color_map[j%2], fill_opacity=0.3) + # shape.commit() + # j = j+1 + + for i, span in enumerate(line["spans"]): + shape_page = page.new_shape() + span_type = span.get("_type") + color = fitz.pdfcolor["blue"] + if span_type == "first": + color = fitz.pdfcolor["blue"] + elif span_type == "tail": + color = fitz.pdfcolor["green"] + elif span_type == TYPE_INLINE_EQUATION: + color = fitz.pdfcolor["black"] + else: + color = None + + b = span["bbox"] + shape_page.draw_rect(b) + + shape_page.finish(color=None, fill=color, fill_opacity=0.3) + shape_page.commit() + + new_doc.save(new_pdf) + logger.info(f"save ok {new_pdf}") + final_json = json.dumps(obj, ensure_ascii=False, indent=2) + with open("equations_test/final_json.json", "w") as f: + f.write(final_json) + + return new_pdf + + +if __name__ == "__main__": + # draw_block_on_pdf_with_txt_replace_eq_bbox(new_json_path, equation_color_pdf) + pass diff --git 
a/magic_pdf/pre_proc/fix_image.py b/magic_pdf/pre_proc/fix_image.py new file mode 100644 index 0000000000000000000000000000000000000000..d2f83570d6ca0f30f1c9bf0ddf90a916fdb80c91 --- /dev/null +++ b/magic_pdf/pre_proc/fix_image.py @@ -0,0 +1,244 @@ + + + +import re +from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox + +from magic_pdf.libs.textbase import get_text_block_base_info + +def fix_image_vertical(image_bboxes:list, text_blocks:list): + """ + 修正图片的位置 + 如果图片与文字block发生一定重叠(也就是图片切到了一部分文字),那么减少图片边缘,让文字和图片不再重叠。 + 只对垂直方向进行。 + """ + for image_bbox in image_bboxes: + for text_block in text_blocks: + text_bbox = text_block["bbox"] + if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]): + if text_bbox[1] < image_bbox[1]:#在图片上方 + image_bbox[1] = text_bbox[3]+1 + elif text_bbox[3]>image_bbox[3]:#在图片下方 + image_bbox[3] = text_bbox[1]-1 + + return image_bboxes + +def __merge_if_common_edge(bbox1, bbox2): + x_min_1, y_min_1, x_max_1, y_max_1 = bbox1 + x_min_2, y_min_2, x_max_2, y_max_2 = bbox2 + + # 检查是否有公共的水平边 + if y_min_1 == y_min_2 or y_max_1 == y_max_2: + # 确保一个框的x范围在另一个框的x范围内 + if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2): + return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)] + + # 检查是否有公共的垂直边 + if x_min_1 == x_min_2 or x_max_1 == x_max_2: + # 确保一个框的y范围在另一个框的y范围内 + if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2): + return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)] + + # 如果没有公共边 + return None + +def fix_seperated_image(image_bboxes:list): + """ + 如果2个图片有一个边重叠,那么合并2个图片 + """ + new_images = [] + droped_img_idx = [] + + for i in range(0, len(image_bboxes)): + for j in range(i+1, len(image_bboxes)): + new_img = 
__merge_if_common_edge(image_bboxes[i], image_bboxes[j]) + if new_img is not None: + new_images.append(new_img) + droped_img_idx.append(i) + droped_img_idx.append(j) + break + + for i in range(0, len(image_bboxes)): + if i not in droped_img_idx: + new_images.append(image_bboxes[i]) + + return new_images + + +def __check_img_title_pattern(text): + """ + 检查文本段是否是表格的标题 + """ + patterns = [r"^(fig|figure).*", r"^(scheme).*"] + text = text.strip() + for pattern in patterns: + match = re.match(pattern, text, re.IGNORECASE) + if match: + return True + return False + +def __get_fig_caption_text(text_block): + txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans']) + line_cnt = len(text_block['lines']) + txt = txt.replace("Ž . ", '') + return txt, line_cnt + + +def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box): + """ + 继续向下方寻找和图片caption字号,字体,颜色一样的文字框,合并入caption。 + text_block是已经找到的图片catpion(这个caption可能不全,多行被划分到多个pymu block里了) + """ + combined_image_caption_text_block = list(text_block.copy()['bbox']) + base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block) + while True: + tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block) + if not tb_add: + break + tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add) + if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type: + combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0]) + combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2]) + combined_image_caption_text_block[3] = tb_add['bbox'][3] + else: + break + + image_box[0] = min(image_box[0], combined_image_caption_text_block[0]) + image_box[1] = min(image_box[1], combined_image_caption_text_block[1]) + image_box[2] = max(image_box[2], combined_image_caption_text_block[2]) + image_box[3] = max(image_box[3], 
combined_image_caption_text_block[3]) + text_block['_image_caption'] = True + + +def include_img_title(pymu_blocks, image_bboxes: list): + """ + 向上方和下方寻找符合图片title的文本block,合并到图片里 + 如果图片上下都有fig的情况怎么办?寻找标题距离最近的那个。 + --- + 增加对左侧和右侧图片标题的寻找 + """ + + + for tb in image_bboxes: + # 优先找下方的 + max_find_cnt = 3 # 向上,向下最多找3个就停止 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_btn: + txt, line_cnt = __get_fig_caption_text(text_block_btn) + if len(txt.strip())>0: + if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # 设置line_cnt<=2目的是为了跳过子标题,或者有时候图片下方文字没有被图片识别模型放入图片里 + max_find_cnt = max_find_cnt - 1 + temp_box[3] = text_block_btn['bbox'][3] + continue + else: + break + else: + temp_box[3] = text_block_btn['bbox'][3] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + max_find_cnt = 3 # 向上,向下最多找3个就停止 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_top: + txt, line_cnt = __get_fig_caption_text(text_block_top) + if len(txt.strip())>0: + if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3: + max_find_cnt = max_find_cnt - 1 + temp_box[1] = text_block_top['bbox'][1] + continue + else: + break + else: + b = text_block_top['bbox'] + temp_box[1] = b[1] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False : + btn_text, _ = __get_fig_caption_text(text_block_btn) + top_text, _ = __get_fig_caption_text(text_block_top) + if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text): + # 取距离图片最近的 + btn_text_distance = text_block_btn['bbox'][1] - tb[3] + top_text_distance = tb[1] - text_block_top['bbox'][3] + if btn_text_distance= 5: + cur_line = (LL, UU, RR, DD) + res.append(cur_line) + LL = L1 + 
else: + RR = max(RR, R1) + cur_line = (LL, UU, RR, DD) + res.append(cur_line) + return res + +def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int): + """ + :param page :fitz读取的当前页的内容 + :param table_bboxes: list类型,每一个元素是一个元祖 (L, U, R, D) + :param include_table_title: 是否将表格的标题也圈进来 + :param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题 + """ + + drawings_lines = get_merged_line(page) + fix_table_bboxes = [] + + for table in table_bboxes: + (L, U, R, D) = table + fix_table_L = [] + fix_table_U = [] + fix_table_R = [] + fix_table_D = [] + width = R - L + width_range = width * 0.1 # 只看距离表格整体宽度10%之内偏差的线 + height = D - U + height_range = height * 0.1 # 只看距离表格整体高度10%之内偏差的线 + for line in drawings_lines: + if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # 相近的宽度 + if (U - height_range) < line[1] < (U + height_range): # 上边界,在一定的高度范围内 + fix_table_U.append(line[1]) + fix_table_L.append(line[0]) + fix_table_R.append(line[2]) + elif (D - height_range) < line[1] < (D + height_range): # 下边界,在一定的高度范围内 + fix_table_D.append(line[1]) + fix_table_L.append(line[0]) + fix_table_R.append(line[2]) + + if fix_table_U: + U = min(fix_table_U) + if fix_table_D: + D = max(fix_table_D) + if fix_table_L: + L = min(fix_table_L) + if fix_table_R: + R = max(fix_table_R) + + if include_table_title: # 需要将表格标题包括 + text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"] # 所有的text的block + incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))] # 将与表格完全没有任何遮挡的文字筛除掉(比如另一栏的文字) + upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0] # 将在表格线以上的text block筛选出来 + sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # 按照text block的下边界距离表格上边界的距离升序排序,如果是同一个高度,则先左再右 + + for idx in range(scan_line_num): + if idx+1 
<= len(sorted_filtered_text_blocks): + line_temp = sorted_filtered_text_blocks[idx]['lines'] + if line_temp: + text = line_temp[0]['spans'][0]['text'] # 提取出第一个span里的text内容 + check_en = re.match('Table', text) # 检查是否有Table开头的(英文) + check_ch = re.match('表', text) # 检查是否有Table开头的(中文) + if check_en or check_ch: + if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # 以防出现负的bbox + U = sorted_filtered_text_blocks[idx]['bbox'][1] + + fix_table_bboxes.append([L-2, U-2, R+2, D+2]) + + return fix_table_bboxes + +def __check_table_title_pattern(text): + """ + 检查文本段是否是表格的标题 + """ + patterns = [r'^table\s\d+'] + + for pattern in patterns: + match = re.match(pattern, text, re.IGNORECASE) + if match: + return True + else: + return False + + +def fix_table_text_block(pymu_blocks, table_bboxes: list): + """ + 调整table, 如果table和上下的text block有相交区域,则将table的上下边界调整到text block的上下边界 + 例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf + """ + for tb in table_bboxes: + (L, U, R, D) = tb + for block in pymu_blocks: + if _is_in_or_part_overlap((L, U, R, D), block['bbox']): + txt = " ".join(span['text'] for line in block['lines'] for span in line['spans']) + if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title,那么不调整。因为下一步会统一调整,如果这里进行了调整,后面的调整会造成调整到其他table的title上(在连续出现2个table的情况下)。 + tb[0] = min(tb[0], block['bbox'][0]) + tb[1] = min(tb[1], block['bbox'][1]) + tb[2] = max(tb[2], block['bbox'][2]) + tb[3] = max(tb[3], block['bbox'][3]) + block['_table'] = True # 占位,防止其他table再次占用 + + """如果是个table的title,但是有部分重叠,那么修正这个title,使得和table不重叠""" + if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt): + block['bbox'] = list(block['bbox']) + if block['bbox'][3] > U: + block['bbox'][3] = U-1 + if block['bbox'][1] < D: + block['bbox'][1] = D+1 + + + return table_bboxes + + +def __get_table_caption_text(text_block): + txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans']) + line_cnt = len(text_block['lines']) + 
txt = txt.replace("Ž . ", '') + return txt, line_cnt + + +def include_table_title(pymu_blocks, table_bboxes: list): + """ + 把表格的title也包含进来,扩展到table_bbox上 + """ + for tb in table_bboxes: + max_find_cnt = 3 # 上上最多找3次 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_top: + txt, line_cnt = __get_table_caption_text(text_block_top) + if len(txt.strip())>0: + if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: + max_find_cnt = max_find_cnt -1 + temp_box[1] = text_block_top['bbox'][1] + continue + else: + break + else: + temp_box[1] = text_block_top['bbox'][1] # 宽度不变,扩大 + max_find_cnt = max_find_cnt - 1 + else: + break + + max_find_cnt = 3 # 向下找 + temp_box = tb.copy() + while max_find_cnt>0: + text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box) + if text_block_bottom: + txt, line_cnt = __get_table_caption_text(text_block_bottom) + if len(txt.strip())>0: + if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: + max_find_cnt = max_find_cnt - 1 + temp_box[3] = text_block_bottom['bbox'][3] + continue + else: + break + else: + temp_box[3] = text_block_bottom['bbox'][3] + max_find_cnt = max_find_cnt - 1 + else: + break + + if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False : + btn_text, _ = __get_table_caption_text(text_block_bottom) + top_text, _ = __get_table_caption_text(text_block_top) + if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # 上下都有一个tbale的caption + # 取距离最近的 + btn_text_distance = text_block_bottom['bbox'][1] - tb[3] + top_text_distance = tb[1] - text_block_top['bbox'][3] + text_block = text_block_bottom if btn_text_distance 0] + if span_font: + # main_text_font应该用基于字数最多的字体而不是span级别的统计 + # font_names.append(font_name for font_name in span_font) + # block_fonts.append(font_name for 
font_name in span_font) + for font, count in span_font: + font_names[font] += count + main_text_font = font_names.most_common(1)[0][0] + return main_text_font + diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py new file mode 100644 index 0000000000000000000000000000000000000000..9f07276de36175fc0637f64522f88af4e3cb90c4 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py @@ -0,0 +1,115 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \ + calculate_iou +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import BlockType +from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block + + +def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks, + title_blocks, interline_equation_blocks, page_w, page_h): + all_bboxes = [] + all_discarded_blocks = [] + for image in img_blocks: + x0, y0, x1, y1 = image['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]]) + + for table in table_blocks: + x0, y0, x1, y1 = table['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]]) + + for text in text_blocks: + x0, y0, x1, y1 = text['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]]) + + for title in title_blocks: + x0, y0, x1, y1 = title['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]]) + + for interline_equation in interline_equation_blocks: + x0, y0, x1, y1 = interline_equation['bbox'] + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]]) + + '''block嵌套问题解决''' + 
'''文本框与标题框重叠,优先信任文本框''' + all_bboxes = fix_text_overlap_title_blocks(all_bboxes) + '''任何框体与舍弃框重叠,优先信任舍弃框''' + all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks) + # @todo interline_equation 与title或text框冲突的情况,分两种情况处理 + '''interline_equation框与文本类型框iou比较接近1的时候,信任行间公式框''' + '''interline_equation框被包含在文本类型框内,且interline_equation比文本区块小很多时信任文本框,这时需要舍弃公式框''' + + '''discarded_blocks中只保留宽度超过1/3页面宽度的,高度超过10的,处于页面下半50%区域的(限定footnote)''' + for discarded in discarded_blocks: + x0, y0, x1, y1 = discarded['bbox'] + all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]]) + # 将footnote加入到all_bboxes中,用来计算layout + if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2): + all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]]) + + '''经过以上处理后,还存在大框套小框的情况,则删除小框''' + all_bboxes = remove_overlaps_min_blocks(all_bboxes) + all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) + '''将剩余的bbox做分离处理,防止后面分layout时出错''' + all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) + + return all_bboxes, all_discarded_blocks, drop_reasons + + +def fix_text_overlap_title_blocks(all_bboxes): + # 先提取所有text和title block + text_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.Text: + text_blocks.append(block) + title_blocks = [] + for block in all_bboxes: + if block[7] == BlockType.Title: + title_blocks.append(block) + + for text_block in text_blocks: + for title_block in title_blocks: + text_block_bbox = text_block[:4] + title_block_bbox = title_block[:4] + if calculate_iou(text_block_bbox, title_block_bbox) > 0.8: + all_bboxes.remove(title_block) + + return all_bboxes + + +def remove_need_drop_blocks(all_bboxes, discarded_blocks): + need_remove = [] + for block in all_bboxes: + for discarded_block in discarded_blocks: + block_bbox = block[:4] + if 
calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6: + if block not in need_remove: + need_remove.append(block) + break + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + return all_bboxes + + +def remove_overlaps_min_blocks(all_bboxes): + # 删除重叠blocks中较小的那些 + need_remove = [] + for block1 in all_bboxes: + for block2 in all_bboxes: + if block1 != block2: + block1_bbox = block1[:4] + block2_bbox = block2[:4] + overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8) + if overlap_box is not None: + bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None) + if bbox_to_remove is not None and bbox_to_remove not in need_remove: + need_remove.append(bbox_to_remove) + + if len(need_remove) > 0: + for block in need_remove: + all_bboxes.remove(block) + + return all_bboxes diff --git a/magic_pdf/pre_proc/ocr_detect_layout.py b/magic_pdf/pre_proc/ocr_detect_layout.py new file mode 100644 index 0000000000000000000000000000000000000000..4dad3593d69696b65b5650e9b9dbadd33c11b595 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_detect_layout.py @@ -0,0 +1,133 @@ +import fitz + +from magic_pdf.layout.layout_sort import get_bboxes_layout +from magic_pdf.libs.boxbase import _is_part_overlap, _is_in +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def get_center_point(bbox): + """ + 根据边界框坐标信息,计算出该边界框的中心点坐标。 + Args: + bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。 + Returns: + list: 中心点坐标信息,包含两个元素,分别为x坐标和y坐标。 + """ + return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2] + + +def get_area(bbox): + """ + 根据边界框坐标信息,计算出该边界框的面积。 + Args: + bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。 + Returns: + float: 该边界框的面积。 + """ + return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) + + +def adjust_layouts(layout_bboxes, page_boundry, page_id): + # 遍历所有布局框 + for i in range(len(layout_bboxes)): + # 遍历当前布局框之后的布局框 + for j in range(i + 1, 
len(layout_bboxes)): + # 判断两个布局框是否重叠 + if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]): + # 计算每个布局框的中心点坐标和面积 + area_i = get_area(layout_bboxes[i]) + area_j = get_area(layout_bboxes[j]) + + # 较大布局框和较小布局框的赋值 + if area_i > area_j: + larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j] + else: + larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i] + + center_large = get_center_point(larger_layout) + center_small = get_center_point(smaller_layout) + # 计算横向和纵向的距离差 + distance_x = center_large[0] - center_small[0] + distance_y = center_large[1] - center_small[1] + + # 根据距离差判断重叠方向并修正边界 + if abs(distance_x) > abs(distance_y): # 左右重叠 + if distance_x > 0 and larger_layout[0] < smaller_layout[2]: + larger_layout[0] = smaller_layout[2]+1 + if distance_x < 0 and larger_layout[2] > smaller_layout[0]: + larger_layout[2] = smaller_layout[0]-1 + else: # 上下重叠 + if distance_y > 0 and larger_layout[1] < smaller_layout[3]: + larger_layout[1] = smaller_layout[3]+1 + if distance_y < 0 and larger_layout[3] > smaller_layout[1]: + larger_layout[3] = smaller_layout[1]-1 + # 排序调整布局边界框列表 + new_bboxes = [] + for layout_bbox in layout_bboxes: + new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None]) + + layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id) + + # 返回排序调整后的布局边界框列表 + return layout_bboxes, layout_tree + + +def layout_detect(layout_info, page: fitz.Page, ocr_page_info): + """ + 对输入的布局信息进行解析,提取出每个子布局的边界框,并对所有子布局进行排序调整。 + + Args: + layout_info (list): 包含子布局信息的列表,每个子布局信息为字典类型,包含'poly'字段,表示子布局的边界框坐标信息。 + + Returns: + list: 经过排序调整后的所有子布局边界框信息的列表,每个边界框信息为字典类型,包含'layout_bbox'字段,表示边界框的坐标信息。 + + """ + page_id = ocr_page_info['page_info']['page_no']-1 + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page) + # 初始化布局边界框列表 + layout_bboxes = [] + # 遍历每个子布局 + for sub_layout in layout_info: + # 提取子布局的边界框坐标信息 + x0, y0, _, _, x1, 
y1, _, _ = sub_layout['poly'] + bbox = [int(x0 / horizontal_scale_ratio), int(y0 / vertical_scale_ratio), + int(x1 / horizontal_scale_ratio), int(y1 / vertical_scale_ratio)] + + # 将子布局的边界框添加到列表中 + layout_bboxes.append(bbox) + + # 初始化新的布局边界框列表 + new_layout_bboxes = [] + # 遍历每个布局边界框 + for i in range(len(layout_bboxes)): + # 初始化标记变量,用于判断当前边界框是否需要保留 + keep = True + # 获取当前边界框的坐标信息 + box_i = layout_bboxes[i] + + # 遍历其他边界框 + for j in range(len(layout_bboxes)): + # 排除当前边界框自身 + if i != j: + # 获取其他边界框的坐标信息 + box_j = layout_bboxes[j] + # 检测box_i是否被box_j包含 + if _is_in(box_i, box_j): + # 如果当前边界框被其他边界框包含,则标记为不需要保留 + keep = False + # 跳出内层循环 + break + + # 如果当前边界框需要保留,则添加到新的布局边界框列表中 + if keep: + new_layout_bboxes.append(layout_bboxes[i]) + + # 对新的布局边界框列表进行排序调整 + page_width = page.rect.width + page_height = page.rect.height + page_boundry = [0, 0, page_width, page_height] + layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id) + + # 返回排序调整后的布局边界框列表 + return layout_bboxes, layout_tree diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..74c1f89f2c7a4c98fd4ae7e971d84f68644c2815 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_dict_merge.py @@ -0,0 +1,336 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \ + calculate_overlap_area_in_bbox1_area_ratio, _is_in_or_part_overlap_with_area_ratio +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import ContentType, BlockType +from magic_pdf.pre_proc.ocr_span_list_modify import modify_y_axis, modify_inline_equation +from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_span + + +# 将每一个line中的span从左到右排序 +def line_sort_spans_by_left_to_right(lines): + line_objects = [] + for line in lines: + # 按照x0坐标排序 + line.sort(key=lambda span: span['bbox'][0]) + line_bbox = [ + 
min(span['bbox'][0] for span in line), # x0 + min(span['bbox'][1] for span in line), # y0 + max(span['bbox'][2] for span in line), # x1 + max(span['bbox'][3] for span in line), # y1 + ] + line_objects.append({ + "bbox": line_bbox, + "spans": line, + }) + return line_objects + + +def merge_spans_to_line(spans): + if len(spans) == 0: + return [] + else: + # 按照y0坐标排序 + spans.sort(key=lambda span: span['bbox'][1]) + + lines = [] + current_line = [spans[0]] + for span in spans[1:]: + # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" + # image和table类型,同上 + if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( + s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in + current_line): + # 则开始新行 + lines.append(current_line) + current_line = [span] + continue + + # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): + current_line.append(span) + else: + # 否则,开始新行 + lines.append(current_line) + current_line = [span] + + # 添加最后一行 + if current_line: + lines.append(current_line) + + return lines + + +def merge_spans_to_line_by_layout(spans, layout_bboxes): + lines = [] + new_spans = [] + dropped_spans = [] + for item in layout_bboxes: + layout_bbox = item['layout_bbox'] + # 遍历spans,将每个span放入对应的layout中 + layout_sapns = [] + for span in spans: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], layout_bbox) > 0.6: + layout_sapns.append(span) + # 如果layout_sapns不为空,则放入new_spans中 + if len(layout_sapns) > 0: + new_spans.append(layout_sapns) + # 从spans删除已经放入layout_sapns中的span + for layout_sapn in layout_sapns: + spans.remove(layout_sapn) + + if len(new_spans) > 0: + for layout_sapns in new_spans: + layout_lines = merge_spans_to_line(layout_sapns) + lines.extend(layout_lines) + + # 对line中的span进行排序 + lines = line_sort_spans_by_left_to_right(lines) + + for span in spans: + span['tag'] = DropTag.NOT_IN_LAYOUT + 
dropped_spans.append(span) + + return lines, dropped_spans + + +def merge_lines_to_block(lines): + # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox + blocks = [] + for line in lines: + blocks.append( + { + "bbox": line["bbox"], + "lines": [line], + } + ) + return blocks + + +def sort_blocks_by_layout(all_bboxes, layout_bboxes): + new_blocks = [] + sort_blocks = [] + for item in layout_bboxes: + layout_bbox = item['layout_bbox'] + + # 遍历blocks,将每个blocks放入对应的layout中 + layout_blocks = [] + for block in all_bboxes: + # 如果是footnote则跳过 + if block[7] == BlockType.Footnote: + continue + block_bbox = block[:4] + if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8: + layout_blocks.append(block) + + # 如果layout_blocks不为空,则放入new_blocks中 + if len(layout_blocks) > 0: + new_blocks.append(layout_blocks) + # 从all_bboxes删除已经放入layout_blocks中的block + for layout_block in layout_blocks: + all_bboxes.remove(layout_block) + + # 如果new_blocks不为空,则对new_blocks中每个block进行排序 + if len(new_blocks) > 0: + for bboxes_in_layout_block in new_blocks: + bboxes_in_layout_block.sort(key=lambda x: x[1]) # 一个layout内部的box,按照y0自上而下排序 + sort_blocks.extend(bboxes_in_layout_block) + + # sort_blocks中已经包含了当前页面所有最终留下的block,且已经排好了顺序 + return sort_blocks + + +def fill_spans_in_blocks(blocks, spans, radio): + ''' + 将allspans中的span按位置关系,放入blocks中 + ''' + block_with_spans = [] + for block in blocks: + block_type = block[7] + block_bbox = block[0:4] + block_dict = { + 'type': block_type, + 'bbox': block_bbox, + } + block_spans = [] + for span in spans: + span_bbox = span['bbox'] + if calculate_overlap_area_in_bbox1_area_ratio(span_bbox, block_bbox) > radio: + block_spans.append(span) + + '''行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)''' + # displayed_list = [] + # text_inline_lines = [] + # modify_y_axis(block_spans, displayed_list, text_inline_lines) + + '''模型识别错误的行间公式, type类型转换成行内公式''' + # block_spans = modify_inline_equation(block_spans, displayed_list, text_inline_lines) + + '''bbox去除粘连''' # 
去粘连会影响span的bbox,导致后续fill的时候出错 + # block_spans = remove_overlap_between_bbox_for_span(block_spans) + + block_dict['spans'] = block_spans + block_with_spans.append(block_dict) + + # 从spans删除已经放入block_spans中的span + if len(block_spans) > 0: + for span in block_spans: + spans.remove(span) + + return block_with_spans, spans + + +def fix_block_spans(block_with_spans, img_blocks, table_blocks): + ''' + 1、img_block和table_block因为包含caption和footnote的关系,存在block的嵌套关系 + 需要将caption和footnote的text_span放入相应img_block和table_block内的 + caption_block和footnote_block中 + 2、同时需要删除block中的spans字段 + ''' + fix_blocks = [] + for block in block_with_spans: + block_type = block['type'] + + if block_type == BlockType.Image: + block = fix_image_block(block, img_blocks) + elif block_type == BlockType.Table: + block = fix_table_block(block, table_blocks) + elif block_type in [BlockType.Text, BlockType.Title]: + block = fix_text_block(block) + elif block_type == BlockType.InterlineEquation: + block = fix_interline_block(block) + else: + continue + fix_blocks.append(block) + return fix_blocks + + +def fix_discarded_block(discarded_block_with_spans): + fix_discarded_blocks = [] + for block in discarded_block_with_spans: + block = fix_text_block(block) + fix_discarded_blocks.append(block) + return fix_discarded_blocks + + +def merge_spans_to_block(spans: list, block_bbox: list, block_type: str): + block_spans = [] + # 如果有img_caption,则将img_block中的text_spans放入img_caption_block中 + for span in spans: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block_bbox) > 0.6: + block_spans.append(span) + block_lines = merge_spans_to_line(block_spans) + # 对line中的span进行排序 + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block = { + 'bbox': block_bbox, + 'type': block_type, + 'lines': sort_block_lines + } + return block, block_spans + + +def make_body_block(span: dict, block_bbox: list, block_type: str): + # 创建body_block + body_line = { + 'bbox': block_bbox, + 'spans': [span], + } + 
body_block = { + 'bbox': block_bbox, + 'type': block_type, + 'lines': [body_line] + } + return body_block + + +def fix_image_block(block, img_blocks): + block['blocks'] = [] + # 遍历img_blocks,找到与当前block匹配的img_block + for img_block in img_blocks: + if _is_in_or_part_overlap_with_area_ratio(block['bbox'], img_block['bbox'], 0.95): + + # 创建img_body_block + for span in block['spans']: + if span['type'] == ContentType.Image and img_block['img_body_bbox'] == span['bbox']: + # 创建img_body_block + img_body_block = make_body_block(span, img_block['img_body_bbox'], BlockType.ImageBody) + block['blocks'].append(img_body_block) + + # 从spans中移除img_body_block中已经放入的span + block['spans'].remove(span) + break + + # 根据list长度,判断img_block中是否有img_caption + if img_block['img_caption_bbox'] is not None: + img_caption_block, img_caption_spans = merge_spans_to_block( + block['spans'], img_block['img_caption_bbox'], BlockType.ImageCaption + ) + block['blocks'].append(img_caption_block) + + break + del block['spans'] + return block + + +def fix_table_block(block, table_blocks): + block['blocks'] = [] + # 遍历table_blocks,找到与当前block匹配的table_block + for table_block in table_blocks: + if _is_in_or_part_overlap_with_area_ratio(block['bbox'], table_block['bbox'], 0.95): + + # 创建table_body_block + for span in block['spans']: + if span['type'] == ContentType.Table and table_block['table_body_bbox'] == span['bbox']: + # 创建table_body_block + table_body_block = make_body_block(span, table_block['table_body_bbox'], BlockType.TableBody) + block['blocks'].append(table_body_block) + + # 从spans中移除img_body_block中已经放入的span + block['spans'].remove(span) + break + + # 根据list长度,判断table_block中是否有caption + if table_block['table_caption_bbox'] is not None: + table_caption_block, table_caption_spans = merge_spans_to_block( + block['spans'], table_block['table_caption_bbox'], BlockType.TableCaption + ) + block['blocks'].append(table_caption_block) + + # 如果table_caption_block_spans不为空 + if len(table_caption_spans) > 0: + 
# 一些span已经放入了caption_block中,需要从block['spans']中删除 + for span in table_caption_spans: + block['spans'].remove(span) + + # 根据list长度,判断table_block中是否有table_note + if table_block['table_footnote_bbox'] is not None: + table_footnote_block, table_footnote_spans = merge_spans_to_block( + block['spans'], table_block['table_footnote_bbox'], BlockType.TableFootnote + ) + block['blocks'].append(table_footnote_block) + + break + del block['spans'] + return block + + +def fix_text_block(block): + # 文本block中的公式span都应该转换成行内type + for span in block['spans']: + if span['type'] == ContentType.InterlineEquation: + span['type'] = ContentType.InlineEquation + block_lines = merge_spans_to_line(block['spans']) + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block['lines'] = sort_block_lines + del block['spans'] + return block + + +def fix_interline_block(block): + block_lines = merge_spans_to_line(block['spans']) + sort_block_lines = line_sort_spans_by_left_to_right(block_lines) + block['lines'] = sort_block_lines + del block['spans'] + return block diff --git a/magic_pdf/pre_proc/ocr_span_list_modify.py b/magic_pdf/pre_proc/ocr_span_list_modify.py new file mode 100644 index 0000000000000000000000000000000000000000..9ed1ea2f82b025ed2d491a91babcdda297165dd1 --- /dev/null +++ b/magic_pdf/pre_proc/ocr_span_list_modify.py @@ -0,0 +1,258 @@ +from loguru import logger + +from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \ + __is_overlaps_y_exceeds_threshold, calculate_iou +from magic_pdf.libs.drop_tag import DropTag +from magic_pdf.libs.ocr_content_type import ContentType, BlockType + +def remove_overlaps_low_confidence_spans(spans): + dropped_spans = [] + # 删除重叠spans中置信度低的的那些 + for span1 in spans: + for span2 in spans: + if span1 != span2: + if calculate_iou(span1['bbox'], span2['bbox']) > 0.9: + if span1['score'] < span2['score']: + span_need_remove = span1 + else: + span_need_remove = span2 + if 
span_need_remove is not None and span_need_remove not in dropped_spans: + dropped_spans.append(span_need_remove) + + if len(dropped_spans) > 0: + for span_need_remove in dropped_spans: + spans.remove(span_need_remove) + span_need_remove['tag'] = DropTag.SPAN_OVERLAP + + return spans, dropped_spans + + +def remove_overlaps_min_spans(spans): + dropped_spans = [] + # 删除重叠spans中较小的那些 + for span1 in spans: + for span2 in spans: + if span1 != span2: + overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65) + if overlap_box is not None: + span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None) + if span_need_remove is not None and span_need_remove not in dropped_spans: + dropped_spans.append(span_need_remove) + + if len(dropped_spans) > 0: + for span_need_remove in dropped_spans: + spans.remove(span_need_remove) + span_need_remove['tag'] = DropTag.SPAN_OVERLAP + + return spans, dropped_spans + + +def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): + # 遍历spans, 判断是否在removed_span_block_bboxes中 + # 如果是, 则删除该span 否则, 保留该span + need_remove_spans = [] + for span in spans: + for removed_bbox in need_remove_spans_bboxes: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: + if span not in need_remove_spans: + need_remove_spans.append(span) + break + + if len(need_remove_spans) > 0: + for span in need_remove_spans: + spans.remove(span) + + return spans + + +def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict): + dropped_spans = [] + for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items(): + # logger.info(f"remove spans by bbox dict, drop_tag: {drop_tag}, removed_bboxes: {removed_bboxes}") + need_remove_spans = [] + for span in spans: + # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span + for removed_bbox in removed_bboxes: + if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: + need_remove_spans.append(span) + break + # 
当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方,如果是,则删除该span + elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \ + removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]: + need_remove_spans.append(span) + break + + for span in need_remove_spans: + spans.remove(span) + span['tag'] = drop_tag + dropped_spans.append(span) + + return spans, dropped_spans + + +def adjust_bbox_for_standalone_block(spans): + # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0 + for sb_span in spans: + if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + for text_span in spans: + if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]: + # 判断span2的纵向高度是否被span所覆盖 + if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]: + # 判断span2是否在span左边 + if text_span['bbox'][0] < sb_span['bbox'][0]: + # 调整span的y0和span2的y0一致 + sb_span['bbox'][1] = text_span['bbox'][1] + return spans + + +def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list): + # displayed_list = [] + # 如果spans为空,则不处理 + if len(spans) == 0: + pass + else: + spans.sort(key=lambda span: span['bbox'][1]) + + lines = [] + current_line = [spans[0]] + if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + displayed_list.append(spans[0]) + + line_first_y0 = spans[0]["bbox"][1] + line_first_y = spans[0]["bbox"][3] + # 用于给行间公式搜索 + # text_inline_lines = [] + for span in spans[1:]: + # if span.get("content","") == "78.": + # print("debug") + # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation" + # image和table类型,同上 + if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any( + s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in + current_line): + # 传入 + if span["type"] in 
[ContentType.InterlineEquation, ContentType.Image, ContentType.Table]: + displayed_list.append(span) + # 则开始新行 + lines.append(current_line) + if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]: + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + current_line = [span] + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + continue + + # 如果当前的span与当前行的最后一个span在y轴上重叠,则添加到当前行 + if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']): + if span["type"] == "text": + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + current_line.append(span) + + else: + # 否则,开始新行 + lines.append(current_line) + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + current_line = [span] + line_first_y0 = span["bbox"][1] + line_first_y = span["bbox"][3] + + # 添加最后一行 + if current_line: + lines.append(current_line) + if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]: + text_inline_lines.append((current_line, (line_first_y0, line_first_y))) + for line in text_inline_lines: + # 按照x0坐标排序 + current_line = line[0] + current_line.sort(key=lambda span: span['bbox'][0]) + + # 调整每一个文字行内bbox统一 + for line in text_inline_lines: + current_line, (line_first_y0, line_first_y) = line + for span in current_line: + span["bbox"][1] = line_first_y0 + span["bbox"][3] = line_first_y + + # return spans, displayed_list, text_inline_lines + + +def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list): + # 错误行间公式转行内公式 + j = 0 + for i in range(len(displayed_list)): + # if i == 8: + # print("debug") + span = displayed_list[i] + span_y0, span_y = span["bbox"][1], span["bbox"][3] + + while j < len(text_inline_lines): + text_line = text_inline_lines[j] + y0, y1 = text_line[1] + if ( + span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1 + ) and 
__is_overlaps_y_exceeds_threshold( + span['bbox'], (0, y0, 0, y1) + ): + # 调整公式类型 + if span["type"] == ContentType.InterlineEquation: + # 最后一行是行间公式 + if j + 1 >= len(text_inline_lines): + span["type"] = ContentType.InlineEquation + span["bbox"][1] = y0 + span["bbox"][3] = y1 + else: + # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换 + y0_next, y1_next = text_inline_lines[j + 1][1] + if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * ( + y1 - y0) > span_y - span_y0: + span["type"] = ContentType.InlineEquation + span["bbox"][1] = y0 + span["bbox"][3] = y1 + break + elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'], + (0, y0, 0, y1)): + break + else: + j += 1 + + return spans + + +def get_qa_need_list(blocks): + # 创建 images, tables, interline_equations, inline_equations 的副本 + images = [] + tables = [] + interline_equations = [] + inline_equations = [] + + for block in blocks: + for line in block["lines"]: + for span in line["spans"]: + if span["type"] == ContentType.Image: + images.append(span) + elif span["type"] == ContentType.Table: + tables.append(span) + elif span["type"] == ContentType.InlineEquation: + inline_equations.append(span) + elif span["type"] == ContentType.InterlineEquation: + interline_equations.append(span) + else: + continue + return images, tables, interline_equations, inline_equations + + +def get_qa_need_list_v2(blocks): + # 创建 images, tables, interline_equations, inline_equations 的副本 + images = [] + tables = [] + interline_equations = [] + + for block in blocks: + if block["type"] == BlockType.Image: + images.append(block) + elif block["type"] == BlockType.Table: + tables.append(block) + elif block["type"] == BlockType.InterlineEquation: + interline_equations.append(block) + return images, tables, interline_equations diff --git a/magic_pdf/pre_proc/pdf_pre_filter.py b/magic_pdf/pre_proc/pdf_pre_filter.py new file mode 100644 index 
0000000000000000000000000000000000000000..1704d0efbd5dadd25ae916d3dc7c14d6f2c4e04a --- /dev/null +++ b/magic_pdf/pre_proc/pdf_pre_filter.py @@ -0,0 +1,74 @@ +from magic_pdf.libs.commons import fitz +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap +from magic_pdf.libs.drop_reason import DropReason + + +def __area(box): + return (box[2] - box[0]) * (box[3] - box[1]) + +def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool: + """ + 检查page是包含有颜色背景的矩形 + """ + color_bg_rect = [] + p_width, p_height = page.rect.width, page.rect.height + + # 先找到最大的带背景矩形 + blocks = page.get_cdrawings() + for block in blocks: + + if 'fill' in block and block['fill']: # 过滤掉透明的 + fill = list(block['fill']) + fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2]) + if fill==(1.0,1.0,1.0): + continue + rect = block['rect'] + # 过滤掉特别小的矩形 + if __area(rect) < 10*10: + continue + # 为了防止是svg图片上的色块,这里过滤掉这类 + + if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]): + continue + color_bg_rect.append(rect) + + # 找到最大的背景矩形 + if len(color_bg_rect) > 0: + max_rect = max(color_bg_rect, key=lambda x:__area(x)) + max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3])) + # 判断最大的背景矩形是否包含超过3行文字,或者50个字 TODO + if max_rect[2]-max_rect[0] > 0.2*p_width and max_rect[3]-max_rect[1] > 0.1*p_height:#宽度符合 + #看是否有文本块落入到这个矩形中 + for text_block in text_blocks: + box = text_block['bbox'] + box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3])) + if _is_in(box_int, max_rect_int): + return True + + return False + + +def __is_table_overlap_text_block(text_blocks, table_bbox): + """ + 检查table_bbox是否覆盖了text_blocks里的文本块 + TODO + """ + for text_block in text_blocks: + box = text_block['bbox'] + if _is_in_or_part_overlap(table_bbox, box): + return True + return False + + +def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple: + """ + return:(True|False, err_msg) + True, 
如果pdf符合要求 + False, 如果pdf不符合要求 + + """ + if __is_contain_color_background_rect(page, text_blocks, image_bboxes): + return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX} + + + return True, None \ No newline at end of file diff --git a/magic_pdf/pre_proc/post_layout_split.py b/magic_pdf/pre_proc/post_layout_split.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/magic_pdf/pre_proc/remove_bbox_overlap.py b/magic_pdf/pre_proc/remove_bbox_overlap.py new file mode 100644 index 0000000000000000000000000000000000000000..2afb07119afe91d412832b5a8766a513c6618560 --- /dev/null +++ b/magic_pdf/pre_proc/remove_bbox_overlap.py @@ -0,0 +1,98 @@ +from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap +from magic_pdf.libs.drop_reason import DropReason + +def _remove_overlap_between_bbox(bbox1, bbox2): + if _is_part_overlap(bbox1, bbox2): + ix0, iy0, ix1, iy1 = bbox1 + x0, y0, x1, y1 = bbox2 + + diff_x = min(x1, ix1) - max(x0, ix0) + diff_y = min(y1, iy1) - max(y0, iy0) + + if diff_y > diff_x: + if x1 >= ix1: + mid = (x0 + ix1) // 2 + ix1 = min(mid - 0.25, ix1) + x0 = max(mid + 0.25, x0) + else: + mid = (ix0 + x1) // 2 + ix0 = max(mid + 0.25, ix0) + x1 = min(mid - 0.25, x1) + else: + if y1 >= iy1: + mid = (y0 + iy1) // 2 + y0 = max(mid + 0.25, y0) + iy1 = min(iy1, mid-0.25) + else: + mid = (iy0 + y1) // 2 + y1 = min(y1, mid-0.25) + iy0 = max(mid + 0.25, iy0) + + if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0: + bbox1 = [ix0, iy0, ix1, iy1] + bbox2 = [x0, y0, x1, y1] + return bbox1, bbox2, None + else: + return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA + else: + return bbox1, bbox2, None + + +def _remove_overlap_between_bboxes(arr): + drop_reasons = [] + N = len(arr) + keeps = [True] * N + res = [None] * N + for i in range(N): + for j in range(N): + if i == j: + continue + if _is_in(arr[i]["bbox"], arr[j]["bbox"]): + keeps[i] = False + + for 
idx, v in enumerate(arr): + if not keeps[idx]: + continue + for i in range(N): + if res[i] is None: + continue + + bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"]) + if drop_reason is None: + v["bbox"] = bbox1 + res[i]["bbox"] = bbox2 + else: + if v["score"] > res[i]["score"]: + keeps[i] = False + res[i] = None + else: + keeps[idx] = False + drop_reasons.append(drop_reasons) + if keeps[idx]: + res[idx] = v + return res, drop_reasons + + +def remove_overlap_between_bbox_for_span(spans): + arr = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ] + res, drop_reasons = _remove_overlap_between_bboxes(arr) + ret = [] + for i in range(len(res)): + if res[i] is None: + continue + spans[i]["bbox"] = res[i]["bbox"] + ret.append(spans[i]) + return ret, drop_reasons + + +def remove_overlap_between_bbox_for_block(all_bboxes): + arr = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ] + res, drop_reasons = _remove_overlap_between_bboxes(arr) + ret = [] + for i in range(len(res)): + if res[i] is None: + continue + all_bboxes[i][:4] = res[i]["bbox"] + ret.append(all_bboxes[i]) + return ret, drop_reasons + diff --git a/magic_pdf/pre_proc/remove_colored_strip_bbox.py b/magic_pdf/pre_proc/remove_colored_strip_bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..17be73aa5010a4f179c2645d972d02afc7ab482b --- /dev/null +++ b/magic_pdf/pre_proc/remove_colored_strip_bbox.py @@ -0,0 +1,79 @@ +from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio +from loguru import logger + +from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK + + +def __area(box): + return (box[2] - box[0]) * (box[3] - box[1]) + + +def rectangle_position_determination(rect, p_width): + """ + 判断矩形是否在页面中轴线附近。 + + Args: + rect (list): 矩形坐标,格式为[x1, y1, x2, y2]。 + p_width (int): 页面宽度。 + + Returns: + bool: 若矩形在页面中轴线附近则返回True,否则返回False。 + """ + # 页面中轴线x坐标 + x_axis = 
p_width / 2 + # 矩形是否跨越中轴线 + is_span = rect[0] < x_axis and rect[2] > x_axis + if is_span: + return True + else: + # 矩形与中轴线的距离,只算近的那一边 + distance = rect[0] - x_axis if rect[0] > x_axis else x_axis - rect[2] + # 判断矩形与中轴线的距离是否小于页面宽度的20% + if distance < p_width * 0.2: + return True + else: + return False + +def remove_colored_strip_textblock(remain_text_blocks, page): + """ + 根据页面中特定颜色和大小过滤文本块,将符合条件的文本块从remain_text_blocks中移除,并返回移除的文本块列表colored_strip_textblock。 + + Args: + remain_text_blocks (list): 剩余文本块列表。 + page (Page): 页面对象。 + + Returns: + tuple: 剩余文本块列表和移除的文本块列表。 + """ + colored_strip_textblocks = [] # 先构造一个空的返回 + if len(remain_text_blocks) > 0: + p_width, p_height = page.rect.width, page.rect.height + blocks = page.get_cdrawings() + colored_strip_bg_rect = [] + for block in blocks: + is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0) # 过滤掉透明的 + rect = block['rect'] + area_is_large_enough = __area(rect) > 100 # 过滤掉特别小的矩形 + rectangle_position_determination_result = rectangle_position_determination(rect, p_width) + in_upper_half_page = rect[3] < p_height * 0.3 # 找到位于页面上半部分的矩形,下边界小于页面高度的30% + aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4 # 找到长宽比超过4的矩形 + + if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4: + colored_strip_bg_rect.append(rect) + + if len(colored_strip_bg_rect) > 0: + for colored_strip_block_bbox in colored_strip_bg_rect: + for text_block in remain_text_blocks: + text_bbox = text_block['bbox'] + if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6): + logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}') + text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK + colored_strip_textblocks.append(text_block) + + if len(colored_strip_textblocks) > 0: + 
for colored_strip_textblock in colored_strip_textblocks: + if colored_strip_textblock in remain_text_blocks: + remain_text_blocks.remove(colored_strip_textblock) + + return remain_text_blocks, colored_strip_textblocks + diff --git a/magic_pdf/pre_proc/remove_footer_header.py b/magic_pdf/pre_proc/remove_footer_header.py new file mode 100644 index 0000000000000000000000000000000000000000..9e04817c3ea2414c21f3631c6deaa8ad4fc7ff9e --- /dev/null +++ b/magic_pdf/pre_proc/remove_footer_header.py @@ -0,0 +1,117 @@ +import re + +from magic_pdf.libs.boxbase import _is_in_or_part_overlap +from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO + + +def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs, + page_no_bboxs, page_w, page_h): + """ + 删除页眉页脚,页码 + 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中 + """ + header = [] + footer = [] + if len(header) == 0: + model_header = header_bboxs + if model_header: + x0 = min([x for x, _, _, _ in model_header]) + y0 = min([y for _, y, _, _ in model_header]) + x1 = max([x1 for _, _, x1, _ in model_header]) + y1 = max([y1 for _, _, _, y1 in model_header]) + header = [x0, y0, x1, y1] + if len(footer) == 0: + model_footer = footer_bboxs + if model_footer: + x0 = min([x for x, _, _, _ in model_footer]) + y0 = min([y for _, y, _, _ in model_footer]) + x1 = max([x1 for _, _, x1, _ in model_footer]) + y1 = max([y1 for _, _, _, y1 in model_footer]) + footer = [x0, y0, x1, y1] + + header_y0 = 0 if len(header) == 0 else header[3] + footer_y0 = page_h if len(footer) == 0 else footer[1] + if page_no_bboxs: + top_part = [b for b in page_no_bboxs if b[3] < page_h / 2] + btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2] + + top_max_y0 = max([b[1] for b in top_part]) if top_part else 0 + btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h + + header_y0 = max(header_y0, top_max_y0) + footer_y0 = min(footer_y0, btn_min_y1) + + content_boundry = [0, 
header_y0, page_w, footer_y0] + + header = [0, 0, page_w, header_y0] + footer = [0, footer_y0, page_w, page_h] + + """以上计算出来了页眉页脚的边界,下面开始进行删除""" + text_block_to_remove = [] + # 首先检查每个textblock + for blk in text_raw_blocks: + if len(blk['lines']) > 0: + for line in blk['lines']: + line_del = [] + for span in line['spans']: + span_del = [] + if span['bbox'][3] < header_y0: + span_del.append(span) + elif _is_in_or_part_overlap(span['bbox'], header) or _is_in_or_part_overlap(span['bbox'], footer): + span_del.append(span) + for span in span_del: + line['spans'].remove(span) + if not line['spans']: + line_del.append(line) + + for line in line_del: + blk['lines'].remove(line) + else: + # if not blk['lines']: + blk['tag'] = CONTENT_IN_FOOT_OR_HEADER + text_block_to_remove.append(blk) + + """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除""" + page_no_block_2_remove = [] + if page_no_bboxs: + for pagenobox in page_no_bboxs: + for block in text_raw_blocks: + if _is_in_or_part_overlap(pagenobox, block['bbox']): # 在span级别删除页码 + for line in block['lines']: + for span in line['spans']: + if _is_in_or_part_overlap(pagenobox, span['bbox']): + # span['text'] = '' + span['tag'] = PAGE_NO + # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除 + if len(line['spans']) == 1 and len(block['lines']) == 1: + page_no_block_2_remove.append(block) + else: + # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字 + if len(text_raw_blocks) > 0: + text_raw_blocks.sort(key=lambda x: x['bbox'][1], reverse=True) + last_block = text_raw_blocks[0] + if len(last_block['lines']) == 1: + last_line = last_block['lines'][0] + if len(last_line['spans']) == 1: + last_span = last_line['spans'][0] + if last_span['text'].strip() and not re.search('[a-zA-Z]', last_span['text']) and re.search('[0-9]', + last_span[ + 'text']): + last_span['tag'] = PAGE_NO + page_no_block_2_remove.append(last_block) + + for b in page_no_block_2_remove: + text_block_to_remove.append(b) + + for blk in 
def detect_non_horizontal_texts(result_dict):
    """
    Detect watermarks and vertical margin notes in the document.

    Blocks whose text direction is diagonal (5°–85°) and which repeat with an
    identical (bbox, text) pair on more than half of the pages are flagged as
    watermarks; blocks whose direction is near-vertical (85°–105°) and repeat
    the same way are flagged as vertical margin notes.  Headers/footers are
    excluded by the repetition requirement, since they can change per page.

    Parameters
    ----------
    result_dict : dict
        Mapping "page_N" -> {"block_M": {...}, ...}; each block may carry a
        "dir" direction vector plus "bbox" and "text".
        NOTE(review): bbox is used as part of a dict key, so it is assumed to
        be hashable (a tuple) — confirm against the producer of this dict.

    Returns
    -------
    dict
        The same dictionary, with "is_watermark" and
        "is_vertical_margin_note" (0/1) added to every block entry.
    """
    potential_watermarks = {}    # (bbox, text) -> occurrence count
    potential_margin_notes = {}  # (bbox, text) -> occurrence count

    for page_id, page_content in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in page_content.items():
            if not (block_id.startswith("block_") and "dir" in block_data):
                continue
            coordinates_text = (block_data["bbox"], block_data["text"])

            angle = abs(math.degrees(math.atan2(block_data["dir"][1], block_data["dir"][0])))

            if 5 < angle < 85:  # diagonal text -> watermark candidate
                potential_watermarks[coordinates_text] = potential_watermarks.get(coordinates_text, 0) + 1
            if 85 < angle < 105:  # near-vertical text -> margin-note candidate
                potential_margin_notes[coordinates_text] = potential_margin_notes.get(coordinates_text, 0) + 1

    # Candidates must repeat on more than half of the pages.
    threshold = len(result_dict) // 2
    watermarks = {k for k, v in potential_watermarks.items() if v > threshold}
    margin_notes = {k for k, v in potential_margin_notes.items() if v > threshold}

    for page_id, blocks in result_dict.items():
        if not page_id.startswith("page_"):
            continue
        for block_id, block_data in blocks.items():
            # Bug fix: the original labelled *every* entry of the page dict,
            # crashing with KeyError/TypeError on non-block entries (page
            # metadata).  Filter exactly like the counting loop above.
            if not block_id.startswith("block_"):
                continue
            coordinates_text = (block_data["bbox"], block_data["text"])
            block_data["is_watermark"] = 1 if coordinates_text in watermarks else 0
            block_data["is_vertical_margin_note"] = 1 if coordinates_text in margin_notes else 0

    return result_dict
def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
    """
    Return True when any two text blocks overlap horizontally.

    If that happens the PDF is abandoned, because such overlaps usually mean
    an equation was not detected.  Only blocks lying between the header band
    and the footer band take part in the check.
    """
    if not text_blocks:
        return False

    top_of_page = 0
    bottom_of_page = max(block['bbox'][3] for block in text_blocks)

    # Clip boundaries: bottom edge of the header area / top edge of the
    # footer area, defaulting to the page extremes when the list is empty.
    clip_top = max((item[1] for item in header), default=top_of_page)
    clip_bottom = min((item[3] for item in footer), default=bottom_of_page)

    candidate_bboxes = [
        block["bbox"]
        for block in text_blocks
        if clip_top <= block["bbox"][1] and block["bbox"][3] <= clip_bottom
    ]

    total = len(candidate_bboxes)
    for i in range(total):
        for j in range(i + 1, total):
            first, second = candidate_bboxes[i], candidate_bboxes[j]
            if _is_left_overlap(first, second) or _is_left_overlap(second, first):
                return True

    return False
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the json
    """
    Fix overly large gaps between inline text fragments.

    When two consecutive lines of a preprocessed block share the same
    (int-truncated) vertical extent they were split from one visual line, so
    a single space is prepended to the later fragment's first span to restore
    word separation.

    Parameters
    ----------
    pdf_info_dict : dict
        Mapping "page_N" -> page dict containing "preproc_blocks".

    Returns
    -------
    dict
        The same dictionary, modified in place.
    """
    # Iterate the page entries directly instead of assuming contiguous
    # "page_0".."page_{n-1}" keys — the original `pdf_info_dict[f'page_{i}']`
    # indexing raised KeyError for dicts with extra keys or numbering gaps.
    for page_key, page in pdf_info_dict.items():
        if not page_key.startswith('page_'):
            continue

        for block in page['preproc_blocks']:
            prev_y0, prev_y1 = 0, 0

            for line in block['lines']:
                _, y0, _, y1 = line['bbox']

                # Same truncated vertical extent as the previous line means
                # the two fragments belong to one visual line.
                if int(y0) == int(prev_y0) and int(y1) == int(prev_y1):
                    # Guard: the original indexed spans[0] unconditionally
                    # and could raise IndexError on a line with no spans.
                    if line['spans']:
                        line['spans'][0]['text'] = ' ' + line['spans'][0]['text']

                _, prev_y0, _, prev_y1 = line['bbox']

    return pdf_info_dict
0000000000000000000000000000000000000000..5bf7c78fe4fb02c0036f25c463720361f905dffd --- /dev/null +++ b/magic_pdf/pre_proc/statistics.py @@ -0,0 +1,12 @@ + +""" +统计处需要跨页、全局性的数据 +- 统计出字号从大到小 +- 正文区域占比最高的前5 +- 正文平均行间距 +- 正文平均字间距 +- 正文平均字符宽度 +- 正文平均字符高度 + +""" + diff --git a/magic_pdf/resources/model_config/UniMERNet/demo.yaml b/magic_pdf/resources/model_config/UniMERNet/demo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0db3064997623fe5501411ca5de20cccde6a405 --- /dev/null +++ b/magic_pdf/resources/model_config/UniMERNet/demo.yaml @@ -0,0 +1,46 @@ +model: + arch: unimernet + model_type: unimernet + model_config: + model_name: ./models + max_seq_len: 1024 + length_aware: False + load_pretrained: True + pretrained: ./models/pytorch_model.bin + tokenizer_config: + path: ./models + +datasets: + formula_rec_eval: + vis_processor: + eval: + name: "formula_image_eval" + image_size: + - 192 + - 672 + +run: + runner: runner_iter + task: unimernet_train + + batch_size_train: 64 + batch_size_eval: 64 + num_workers: 1 + + iters_per_inner_epoch: 2000 + max_iters: 60000 + + seed: 42 + output_dir: "../output/demo" + + evaluate: True + test_splits: [ "eval" ] + + device: "cuda" + world_size: 1 + dist_url: "env://" + distributed: True + distributed_type: ddp # or fsdp when train llm + + generate_cfg: + temperature: 0.0 diff --git a/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6f47cb9cde0c59d38ea1a736c4c6d7ad9efddaa3 --- /dev/null +++ b/magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml @@ -0,0 +1,351 @@ +AUG: + DETR: true +CACHE_DIR: /mnt/localdata/users/yupanhuang/cache/huggingface +CUDNN_BENCHMARK: false +DATALOADER: + ASPECT_RATIO_GROUPING: true + FILTER_EMPTY_ANNOTATIONS: false + NUM_WORKERS: 4 + REPEAT_THRESHOLD: 0.0 + SAMPLER_TRAIN: TrainingSampler 
+DATASETS: + PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 + PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 + PROPOSAL_FILES_TEST: [] + PROPOSAL_FILES_TRAIN: [] + TEST: + - scihub_train + TRAIN: + - scihub_train +GLOBAL: + HACK: 1.0 +ICDAR_DATA_DIR_TEST: '' +ICDAR_DATA_DIR_TRAIN: '' +INPUT: + CROP: + ENABLED: true + SIZE: + - 384 + - 600 + TYPE: absolute_range + FORMAT: RGB + MASK_FORMAT: polygon + MAX_SIZE_TEST: 1333 + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MIN_SIZE_TRAIN: + - 480 + - 512 + - 544 + - 576 + - 608 + - 640 + - 672 + - 704 + - 736 + - 768 + - 800 + MIN_SIZE_TRAIN_SAMPLING: choice + RANDOM_FLIP: horizontal +MODEL: + ANCHOR_GENERATOR: + ANGLES: + - - -90 + - 0 + - 90 + ASPECT_RATIOS: + - - 0.5 + - 1.0 + - 2.0 + NAME: DefaultAnchorGenerator + OFFSET: 0.0 + SIZES: + - - 32 + - - 64 + - - 128 + - - 256 + - - 512 + BACKBONE: + FREEZE_AT: 2 + NAME: build_vit_fpn_backbone + CONFIG_PATH: '' + DEVICE: cuda + FPN: + FUSE_TYPE: sum + IN_FEATURES: + - layer3 + - layer5 + - layer7 + - layer11 + NORM: '' + OUT_CHANNELS: 256 + IMAGE_ONLY: true + KEYPOINT_ON: false + LOAD_PROPOSALS: false + MASK_ON: true + META_ARCHITECTURE: VLGeneralizedRCNN + PANOPTIC_FPN: + COMBINE: + ENABLED: true + INSTANCES_CONFIDENCE_THRESH: 0.5 + OVERLAP_THRESH: 0.5 + STUFF_AREA_LIMIT: 4096 + INSTANCE_LOSS_WEIGHT: 1.0 + PIXEL_MEAN: + - 127.5 + - 127.5 + - 127.5 + PIXEL_STD: + - 127.5 + - 127.5 + - 127.5 + PROPOSAL_GENERATOR: + MIN_SIZE: 0 + NAME: RPN + RESNETS: + DEFORM_MODULATED: false + DEFORM_NUM_GROUPS: 1 + DEFORM_ON_PER_STAGE: + - false + - false + - false + - false + DEPTH: 50 + NORM: FrozenBN + NUM_GROUPS: 1 + OUT_FEATURES: + - res4 + RES2_OUT_CHANNELS: 256 + RES5_DILATION: 1 + STEM_OUT_CHANNELS: 64 + STRIDE_IN_1X1: true + WIDTH_PER_GROUP: 64 + RETINANET: + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_WEIGHTS: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + FOCAL_LOSS_ALPHA: 0.25 + FOCAL_LOSS_GAMMA: 2.0 + IN_FEATURES: + - p3 + - p4 + - p5 + - p6 + - p7 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.4 + - 
0.5 + NMS_THRESH_TEST: 0.5 + NORM: '' + NUM_CLASSES: 10 + NUM_CONVS: 4 + PRIOR_PROB: 0.01 + SCORE_THRESH_TEST: 0.05 + SMOOTH_L1_LOSS_BETA: 0.1 + TOPK_CANDIDATES_TEST: 1000 + ROI_BOX_CASCADE_HEAD: + BBOX_REG_WEIGHTS: + - - 10.0 + - 10.0 + - 5.0 + - 5.0 + - - 20.0 + - 20.0 + - 10.0 + - 10.0 + - - 30.0 + - 30.0 + - 15.0 + - 15.0 + IOUS: + - 0.5 + - 0.6 + - 0.7 + ROI_BOX_HEAD: + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: + - 10.0 + - 10.0 + - 5.0 + - 5.0 + CLS_AGNOSTIC_BBOX_REG: true + CONV_DIM: 256 + FC_DIM: 1024 + NAME: FastRCNNConvFCHead + NORM: '' + NUM_CONV: 0 + NUM_FC: 2 + POOLER_RESOLUTION: 7 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + SMOOTH_L1_BETA: 0.0 + TRAIN_ON_PRED_BOXES: false + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 512 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + IOU_LABELS: + - 0 + - 1 + IOU_THRESHOLDS: + - 0.5 + NAME: CascadeROIHeads + NMS_THRESH_TEST: 0.5 + NUM_CLASSES: 10 + POSITIVE_FRACTION: 0.25 + PROPOSAL_APPEND_GT: true + SCORE_THRESH_TEST: 0.05 + ROI_KEYPOINT_HEAD: + CONV_DIMS: + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + - 512 + LOSS_WEIGHT: 1.0 + MIN_KEYPOINTS_PER_IMAGE: 1 + NAME: KRCNNConvDeconvUpsampleHead + NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: true + NUM_KEYPOINTS: 17 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + ROI_MASK_HEAD: + CLS_AGNOSTIC_MASK: false + CONV_DIM: 256 + NAME: MaskRCNNConvUpsampleHead + NORM: '' + NUM_CONV: 4 + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 0 + POOLER_TYPE: ROIAlignV2 + RPN: + BATCH_SIZE_PER_IMAGE: 256 + BBOX_REG_LOSS_TYPE: smooth_l1 + BBOX_REG_LOSS_WEIGHT: 1.0 + BBOX_REG_WEIGHTS: + - 1.0 + - 1.0 + - 1.0 + - 1.0 + BOUNDARY_THRESH: -1 + CONV_DIMS: + - -1 + HEAD_NAME: StandardRPNHead + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + - p6 + IOU_LABELS: + - 0 + - -1 + - 1 + IOU_THRESHOLDS: + - 0.3 + - 0.7 + LOSS_WEIGHT: 1.0 + NMS_THRESH: 0.7 + POSITIVE_FRACTION: 0.5 + POST_NMS_TOPK_TEST: 1000 + POST_NMS_TOPK_TRAIN: 2000 + 
PRE_NMS_TOPK_TEST: 1000 + PRE_NMS_TOPK_TRAIN: 2000 + SMOOTH_L1_BETA: 0.0 + SEM_SEG_HEAD: + COMMON_STRIDE: 4 + CONVS_DIM: 128 + IGNORE_VALUE: 255 + IN_FEATURES: + - p2 + - p3 + - p4 + - p5 + LOSS_WEIGHT: 1.0 + NAME: SemSegFPNHead + NORM: GN + NUM_CLASSES: 10 + VIT: + DROP_PATH: 0.1 + IMG_SIZE: + - 224 + - 224 + NAME: layoutlmv3_base + OUT_FEATURES: + - layer3 + - layer5 + - layer7 + - layer11 + POS_TYPE: abs + WEIGHTS: +OUTPUT_DIR: +SCIHUB_DATA_DIR_TRAIN: /mnt/petrelfs/share_data/zhaozhiyuan/publaynet/layout_scihub/train +SEED: 42 +SOLVER: + AMP: + ENABLED: true + BACKBONE_MULTIPLIER: 1.0 + BASE_LR: 0.0002 + BIAS_LR_FACTOR: 1.0 + CHECKPOINT_PERIOD: 2000 + CLIP_GRADIENTS: + CLIP_TYPE: full_model + CLIP_VALUE: 1.0 + ENABLED: true + NORM_TYPE: 2.0 + GAMMA: 0.1 + GRADIENT_ACCUMULATION_STEPS: 1 + IMS_PER_BATCH: 32 + LR_SCHEDULER_NAME: WarmupCosineLR + MAX_ITER: 20000 + MOMENTUM: 0.9 + NESTEROV: false + OPTIMIZER: ADAMW + REFERENCE_WORLD_SIZE: 0 + STEPS: + - 10000 + WARMUP_FACTOR: 0.01 + WARMUP_ITERS: 333 + WARMUP_METHOD: linear + WEIGHT_DECAY: 0.05 + WEIGHT_DECAY_BIAS: null + WEIGHT_DECAY_NORM: 0.0 +TEST: + AUG: + ENABLED: false + FLIP: true + MAX_SIZE: 4000 + MIN_SIZES: + - 400 + - 500 + - 600 + - 700 + - 800 + - 900 + - 1000 + - 1100 + - 1200 + DETECTIONS_PER_IMAGE: 100 + EVAL_PERIOD: 1000 + EXPECTED_RESULTS: [] + KEYPOINT_OKS_SIGMAS: [] + PRECISE_BN: + ENABLED: false + NUM_ITER: 200 +VERSION: 2 +VIS_PERIOD: 0 diff --git a/magic_pdf/resources/model_config/model_configs.yaml b/magic_pdf/resources/model_config/model_configs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44cc8889178ce9a624c2ec92921640a553295ef1 --- /dev/null +++ b/magic_pdf/resources/model_config/model_configs.yaml @@ -0,0 +1,9 @@ +config: + device: cpu + layout: True + formula: True + +weights: + layout: Layout/model_final.pth + mfd: MFD/weights.pt + mfr: MFR/UniMERNet diff --git a/magic_pdf/rw/AbsReaderWriter.py b/magic_pdf/rw/AbsReaderWriter.py new file mode 100644 index 
class AbsReaderWriter(ABC):
    """Abstract reader/writer supporting both binary and text payloads."""

    # Mode constants shared by all concrete implementations.
    MODE_TXT = "text"
    MODE_BIN = "binary"

    def __init__(self, parent_path):
        # Base directory for local implementations; destination prefix for
        # S3-style implementations.  Relative paths are resolved against it.
        self.parent_path = parent_path

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read *path*; absolute paths (local or s3) are used as-is, relative ones are joined with parent_path."""
        raise NotImplementedError

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write *content* to *path*; absolute paths are used as-is, relative ones are joined with parent_path."""
        raise NotImplementedError

    @abstractmethod
    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
        """Read a byte range of a JSONL file; path resolution follows read()/write()."""
        raise NotImplementedError
MODE_TXT = "text"
MODE_BIN = "binary"


class DiskReaderWriter(AbsReaderWriter):
    """Local-filesystem implementation of AbsReaderWriter."""

    def __init__(self, parent_path, encoding="utf-8"):
        # Fix: also initialise the base class so AbsReaderWriter.parent_path
        # is populated; keep self.path for existing callers of this class.
        super().__init__(parent_path)
        self.path = parent_path
        self.encoding = encoding

    def _abs_path(self, path):
        """Return *path* unchanged if absolute, else joined with the parent path."""
        return path if os.path.isabs(path) else os.path.join(self.path, path)

    def read(self, path, mode=MODE_TXT):
        """
        Read a file as text (MODE_TXT) or bytes (MODE_BIN).

        Raises an Exception when the file does not exist and ValueError for
        an unknown mode.
        """
        abspath = self._abs_path(path)
        if not os.path.exists(abspath):
            logger.error(f"文件 {abspath} 不存在")
            raise Exception(f"文件 {abspath} 不存在")
        if mode == MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        if mode == MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=MODE_TXT):
        """Write text or bytes to *path*, creating parent directories as needed."""
        abspath = self._abs_path(path)
        directory_path = os.path.dirname(abspath)
        if directory_path:
            # exist_ok avoids the check-then-create race of the original.
            os.makedirs(directory_path, exist_ok=True)
        if mode == MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
        # NOTE(review): byte_start/byte_end are accepted for interface
        # compatibility but ignored, exactly as in the original — the whole
        # file is returned.  Confirm before relying on local range reads.
        return self.read(path)


# Usage example
if __name__ == "__main__":
    file_path = "io/test/example.txt"
    # Fix: raw string — "\p" in the original was an invalid escape sequence.
    drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")

    # Write content to a file
    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")

    # Read content back
    content = drw.read(path=file_path)
    if content:
        logger.info(f"从 {file_path} 读取的内容: {content}")
class S3ReaderWriter(AbsReaderWriter):
    """S3-backed implementation of AbsReaderWriter."""

    def __init__(self, ak: str, sk: str, endpoint_url: str, addressing_style: str = 'auto', parent_path: str = ''):
        # Fix: initialise the base class so parent_path is populated there too.
        super().__init__(parent_path)
        self.client = self._get_client(ak, sk, endpoint_url, addressing_style)
        self.path = parent_path

    def _get_client(self, ak: str, sk: str, endpoint_url: str, addressing_style: str):
        """Build a boto3 S3 client with standard retries and the given addressing style."""
        return boto3.client(
            service_name="s3",
            aws_access_key_id=ak,
            aws_secret_access_key=sk,
            endpoint_url=endpoint_url,
            config=Config(
                s3={"addressing_style": addressing_style},
                retries={'max_attempts': 5, 'mode': 'standard'},
            ),
        )

    def _full_s3_path(self, path: str) -> str:
        """Paths already starting with s3:// are used as-is; others join self.path."""
        return path if path.startswith("s3://") else join_path(self.path, path)

    def read(self, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Fetch an object; decode to text in MODE_TXT, return raw bytes in MODE_BIN."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(s3_relative_path))
        body = self.client.get_object(Bucket=bucket_name, Key=key)["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)
        if mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, s3_relative_path, mode=MODE_TXT, encoding="utf-8"):
        """Upload *content*; text is encoded with *encoding*, bytes go as-is."""
        s3_path = self._full_s3_path(s3_relative_path)
        if mode == MODE_TXT:
            body = content.encode(encoding)
        elif mode == MODE_BIN:
            body = content
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")
        bucket_name, key = parse_bucket_key(s3_path)
        self.client.put_object(Body=body, Bucket=bucket_name, Key=key)
        logger.info(f"内容已写入 {s3_path} ")

    def read_jsonl(self, path: str, byte_start=0, byte_end=None, mode=MODE_TXT, encoding='utf-8'):
        """Range-read an object (bytes byte_start..byte_end, inclusive)."""
        bucket_name, key = parse_bucket_key(self._full_s3_path(path))
        # Fix: compare against None — the original `if byte_end` wrongly
        # treated byte_end=0 as "read to the end".
        range_header = f'bytes={byte_start}-{byte_end}' if byte_end is not None else f'bytes={byte_start}-'
        res = self.client.get_object(Bucket=bucket_name, Key=key, Range=range_header)
        body = res["Body"].read()
        if mode == MODE_TXT:
            return body.decode(encoding)
        if mode == MODE_BIN:
            return body
        raise ValueError("Invalid mode. Use 'text' or 'binary'.")


if __name__ == "__main__":
    # Config the connection info
    ak = ""
    sk = ""
    endpoint_url = ""
    addressing_style = "auto"
    bucket_name = ""
    # Create an S3ReaderWriter object
    s3_reader_writer = S3ReaderWriter(ak, sk, endpoint_url, addressing_style, "s3://bucket_name/")

    # Write text data to S3
    # Fix: write() takes `content`, not `data=` — the original raised TypeError here.
    text_data = "This is some text data"
    s3_reader_writer.write(text_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)

    # Read text data from S3
    text_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_TXT)
    logger.info(f"Read text data from S3: {text_data_read}")

    # Write binary data to S3
    # Fix: the original uploaded text_data here by mistake.
    binary_data = b"This is some binary data"
    s3_reader_writer.write(binary_data, s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)

    # Read binary data from S3
    binary_data_read = s3_reader_writer.read(s3_relative_path=f"s3://{bucket_name}/ebook/test/test.json", mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")

    # Range Read text data from S3
    binary_data_read = s3_reader_writer.read_jsonl(path=f"s3://{bucket_name}/ebook/test/test.json",
                                                   byte_start=0, byte_end=10, mode=MODE_BIN)
    logger.info(f"Read binary data from S3: {binary_data_read}")
def _first_present(jso: dict, primary: str, fallback: str):
    """Return jso[primary] unless it is missing/None, else jso[fallback]."""
    value = jso.get(primary)
    return value if value is not None else jso.get(fallback)


def get_data_source(jso: dict):
    """Data source name, falling back to the legacy 'file_source' key."""
    return _first_present(jso, "data_source", "file_source")


def get_data_type(jso: dict):
    """Data type, falling back to the legacy 'file_type' key."""
    return _first_present(jso, "data_type", "file_type")


def get_bookid(jso: dict):
    """Book id, falling back to the legacy 'original_file_id' key."""
    return _first_present(jso, "bookid", "original_file_id")


def exception_handler(jso: dict, e):
    """Log *e* and mark *jso* as dropped with the exception as the reason."""
    logger.exception(e)
    jso["_need_drop"] = True
    jso["_drop_reason"] = DropReason.Exception
    jso["_exception"] = f"ERROR: {e}"
    return jso


def get_bookname(jso: dict):
    """Book name built as '<data_source>/<file_id>'."""
    return f"{get_data_source(jso)}/{jso.get('file_id')}"


def spark_json_extractor(jso: dict) -> dict:
    """Extract the pdf type and layout-model output from a spark json record."""
    return {
        "_pdf_type": jso["_pdf_type"],
        "model_list": jso["doc_layout_result"],
    }
def convert_to_train_format(jso: dict) -> list:
    """
    Convert one document's preprocessed JSON into the layout-training format.

    For every "page_N" entry, emits a dict with "page_info" (page number and
    size), "bboxes" (category-labelled boxes) and "layout_tree".

    Category ids used here: 0 title, 1 figure, 2 plain text, 3 header,
    4 page number, 5 footnote, 6 footer, 7 table, 10 interline equation,
    13 inline equation.

    Fix: the original return annotation was ``-> []`` (a list literal
    evaluated at def time), corrected to the type ``list``.
    """
    pages = []
    for k, v in jso.items():
        if not k.startswith("page_"):
            continue
        page_idx = v["page_idx"]
        width, height = v["page_size"]

        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}

        bboxes: list[dict] = []
        for img_bbox in v["image_bboxes_with_caption"]:
            bbox = {"category_id": 1, "bbox": img_bbox["bbox"]}
            if "caption" in img_bbox:
                bbox["caption_bbox"] = img_bbox["caption"]
            bboxes.append(bbox)

        for tbl_bbox in v["table_bboxes_with_caption"]:
            bbox = {"category_id": 7, "bbox": tbl_bbox["bbox"]}
            if "caption" in tbl_bbox:
                bbox["caption_bbox"] = tbl_bbox["caption"]
            bboxes.append(bbox)

        for box in v["bak_page_no_bboxes"]:
            bboxes.append({"category_id": 4, "bbox": box})

        for box in v["bak_header_bboxes"]:
            bboxes.append({"category_id": 3, "bbox": box})

        for box in v["bak_footer_bboxes"]:
            bboxes.append({"category_id": 6, "bbox": box})

        # Paragraphs: titles are category 0, body text category 2.
        # 脚注, 目前没有看到例子 (footnotes: no example observed yet)
        for para in v["para_blocks"]:
            if "paras" in para:
                for para_content in para["paras"].values():
                    category = 0 if para_content["is_para_title"] else 2
                    bboxes.append({"category_id": category, "bbox": para_content["para_bbox"]})

        for inline_equation in v["inline_equations"]:
            bboxes.append({"category_id": 13, "bbox": inline_equation["bbox"]})

        for inter_equation in v["interline_equations"]:
            bboxes.append({"category_id": 10, "bbox": inter_equation["bbox"]})

        for footnote_bbox in v["bak_footer_note_bboxes"]:
            bboxes.append({"category_id": 5, "bbox": list(footnote_bbox)})

        info["bboxes"] = bboxes
        info["layout_tree"] = v["layout_bboxes"]
        pages.append(info)

    return pages
def extract_caption_bbox(outer: list, inner: list) -> list:
    """
    Pair each inner bbox with its caption region inside an enclosing outer bbox.

    For every inner box strictly contained in (and not coordinate-identical
    to) an outer box, the four strips of the outer box around the inner box
    are computed and the largest one is taken as the caption area.

    ret: list of {
        "bbox": [1, 2, 3, 4],
        "caption": [5, 6, 7, 8]   # present only when an enclosing outer box was found
    }
    """

    def _nearly_equal(a, b):
        # Non-strict float comparison: coordinates within 0.01 count as equal.
        return abs(a - b) < 0.01

    # Index the outer boxes so each can be consumed at most once.
    available_outer = {i: outer[i] for i in range(len(outer))}
    result = []
    for v in inner:
        ix0, iy0, ix1, iy1 = v
        entry = {"bbox": v[:4]}
        match_idx = None
        for idx in available_outer:
            ox0, oy0, ox1, oy1 = available_outer[idx]
            same_box = all([
                _nearly_equal(ix0, ox0),
                _nearly_equal(iy0, oy0),
                _nearly_equal(ix1, ox1),
                _nearly_equal(iy1, oy1),
            ])
            if _is_in(v, available_outer[idx]) and not same_box:
                match_idx = idx
                break
        if match_idx is not None:
            ox0, oy0, ox1, oy1 = available_outer[match_idx]
            # Four candidate strips: left, top, bottom and right of the inner box.
            strips = [
                [ox0, oy0, ix0, oy1],
                [ox0, oy0, ox1, iy0],
                [ox0, iy1, ox1, oy1],
                [ix1, oy0, ox1, oy1],
            ]
            strips.sort(key=lambda rect: abs(rect[0] - rect[2]) * abs(rect[1] - rect[3]))
            # The largest strip is the caption (面积最大的框就是caption).
            entry["caption"] = strips[-1]
            # One outer box determines the caption of at most one inner box.
            available_outer.pop(match_idx)
        result.append(entry)

    # Fix: removed the stray debug `print(outer, inner)` /
    # `print("found_count: ", ...)` calls left in the original.
    return result
def remove_headder_footer_one_page(
    text_raw_blocks,
    image_bboxes,
    table_bboxes,
    header_bboxs,
    footer_bboxs,
    page_no_bboxs,
    page_w,
    page_h,
):
    """
    Remove headers, footers and page numbers from one page (training-utils copy).

    NOTE(review): this copy is deliberately disabled — the original body began
    with ``if 1: return ...``, so the header/footer-removal logic below it was
    unreachable dead code.  This version keeps exactly that observable
    behaviour (pass every input through, drop nothing) and removes the
    unreachable duplicate; pre_proc/remove_footer_header.py remains the
    maintained implementation of the real logic.

    Returns
    -------
    tuple
        (image_bbox_remain, table_bbox_remain, text_block_remain,
         text_block_to_remove, image_bbox_to_remove, table_bbox_to_remove) —
        the inputs unchanged followed by three empty removal lists.
    """
    return image_bboxes, table_bboxes, text_raw_blocks, [], [], []
b/magic_pdf/train_utils/vis_utils.py @@ -0,0 +1,327 @@ +from magic_pdf.libs.commons import fitz +import os +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def draw_model_output( + raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str +): + """ + 在page上画出bbox,保存到save_path + """ + """ + + # {0: 'title', # 标题 + # 1: 'figure', # 图片 + # 2: 'plain text', # 文本 + # 3: 'header', # 页眉 + # 4: 'page number', # 页码 + # 5: 'footnote', # 脚注 + # 6: 'footer', # 页脚 + # 7: 'table', # 表格 + # 8: 'table caption', # 表格描述 + # 9: 'figure caption', # 图片描述 + # 10: 'equation', # 公式 + # 11: 'full column', # 单栏 + # 12: 'sub column', # 多栏 + # 13: 'embedding', # 嵌入公式 + # 14: 'isolated'} # 单行公式 + + """ + + color_map = { + "body": fitz.pdfcolor["green"], + "non_body": fitz.pdfcolor["red"], + } + """ + {"layout_dets": [], "subfield_dets": [], "page_info": {"page_no": 22, "height": 1650, "width": 1275}} + """ + for i, page in enumerate(raw_pdf_doc): + v = paras_dict_arr[i] + page_idx = v["page_info"]["page_no"] + width = v["page_info"]["width"] + height = v["page_info"]["height"] + + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( + paras_dict_arr[i], page + ) + + for order, block in enumerate(v["layout_dets"]): + L = block["poly"][0] / horizontal_scale_ratio + U = block["poly"][1] / vertical_scale_ratio + R = block["poly"][2] / horizontal_scale_ratio + D = block["poly"][5] / vertical_scale_ratio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + bbox = [L, U, R, D] + color = color_map["body"] + if block["category_id"] in (3, 4, 5, 6, 0): + color = color_map["non_body"] + + rect = fitz.Rect(bbox) + page.draw_rect(rect, fill=None, width=0.5, overlay=True, color=color) + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + raw_pdf_doc.save(save_path) + + +def debug_show_bbox( + raw_pdf_doc: 
fitz.Document, + page_idx: int, + bboxes: list, + droped_bboxes: list, + expect_drop_bboxes: list, + save_path: str, + expected_page_id: int, +): + """ + 以覆盖的方式写个临时的pdf,用于debug + """ + if page_idx != expected_page_id: + return + + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = raw_pdf_doc[page_idx].rect.width + height = raw_pdf_doc[page_idx].rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in droped_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2) + shape.finish() + shape.commit() + + for bbox in expect_drop_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=None) + shape.finish() + shape.commit() + + # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + doc.save(save_path) + doc.close() + + +def debug_show_page( + page, + bboxes1: list, + bboxes2: list, + bboxes3: list, +): + save_path = "./tmp/debug.pdf" + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = page.rect.width + height = page.rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes1: + # 原始box画上去 + rect = 
fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in bboxes2: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2) + shape.finish() + shape.commit() + + for bbox in bboxes3: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=None) + shape.finish() + shape.commit() + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + doc.save(save_path) + doc.close() + + +def draw_layout_bbox_on_page( + raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str +): + """ + 在page上画出bbox,保存到save_path + """ + # 检查文件是否存在 + is_new_pdf = False + if os.path.exists(pdf_path): + # 打开现有的 PDF 文件 + doc = fitz.open(pdf_path) + else: + # 创建一个新的空白 PDF 文件 + is_new_pdf = True + doc = fitz.open("") + + for k, v in paras_dict.items(): + page_idx = v["page_idx"] + layouts = v["layout_bboxes"] + page = doc[page_idx] + shape = page.new_shape() + for order, layout in enumerate(layouts): + border_offset = 1 + rect_box = layout["layout_bbox"] + layout_label = layout["layout_label"] + fill_color = fitz.pdfcolor["pink"] if layout_label == "U" else None + rect_box = [ + rect_box[0] + 1, + rect_box[1] - border_offset, + rect_box[2] - 1, + rect_box[3] + border_offset, + ] + rect = fitz.Rect(*rect_box) + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4) + """ + draw order text on layout box + """ + font_size = 10 + shape.insert_text( + (rect_box[0] + 1, rect_box[1] + font_size), + f"{order}", + fontsize=font_size, + color=(0, 0, 0), + ) + + """画上footer header""" + if header: + shape.draw_rect(fitz.Rect(header)) 
+ shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2) + if footer: + shape.draw_rect(fitz.Rect(footer)) + shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2) + + shape.commit() + + if is_new_pdf: + doc.save(pdf_path) + else: + doc.saveIncr() + doc.close() + + +@DeprecationWarning +def draw_layout_on_page( + raw_pdf_doc: fitz.Document, page_idx: int, page_layout: list, pdf_path: str +): + """ + 把layout的box用红色边框花在pdf_path的page_idx上 + """ + + def draw(shape, layout, fill_color=fitz.pdfcolor["pink"]): + border_offset = 1 + rect_box = layout["layout_bbox"] + layout_label = layout["layout_label"] + sub_layout = layout["sub_layout"] + if len(sub_layout) == 0: + fill_color = fill_color if layout_label == "U" else None + rect_box = [ + rect_box[0] + 1, + rect_box[1] - border_offset, + rect_box[2] - 1, + rect_box[3] + border_offset, + ] + rect = fitz.Rect(*rect_box) + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.2) + # if layout_label=='U': + # bad_boxes = layout.get("bad_boxes", []) + # for bad_box in bad_boxes: + # rect = fitz.Rect(*bad_box) + # shape.draw_rect(rect) + # shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['red'], fill_opacity=0.2) + # else: + # rect = fitz.Rect(*rect_box) + # shape.draw_rect(rect) + # shape.finish(color=fitz.pdfcolor['blue']) + + for sub_layout in sub_layout: + draw(shape, sub_layout) + shape.commit() + + # 检查文件是否存在 + is_new_pdf = False + if os.path.exists(pdf_path): + # 打开现有的 PDF 文件 + doc = fitz.open(pdf_path) + else: + # 创建一个新的空白 PDF 文件 + is_new_pdf = True + doc = fitz.open("") + + page = doc[page_idx] + shape = page.new_shape() + for order, layout in enumerate(page_layout): + draw(shape, layout, fitz.pdfcolor["yellow"]) + + # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(layout)}", fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = 
os.path.dirname(pdf_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + if is_new_pdf: + doc.save(pdf_path) + else: + doc.saveIncr() + doc.close() diff --git a/magic_pdf/user_api.py b/magic_pdf/user_api.py new file mode 100644 index 0000000000000000000000000000000000000000..bf36477294355a716400af177ce1a7f75a2e5806 --- /dev/null +++ b/magic_pdf/user_api.py @@ -0,0 +1,103 @@ +""" +用户输入: + model数组,每个元素代表一个页面 + pdf在s3的路径 + 截图保存的s3位置 + +然后: + 1)根据s3路径,调用spark集群的api,拿到ak,sk,endpoint,构造出s3PDFReader + 2)根据用户输入的s3地址,调用spark集群的api,拿到ak,sk,endpoint,构造出s3ImageWriter + +其余部分至于构造s3cli, 获取ak,sk都在code-clean里写代码完成。不要反向依赖!!! + +""" +import re + +from loguru import logger + +from magic_pdf.libs.version import __version__ +from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze +from magic_pdf.rw import AbsReaderWriter +from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr +from magic_pdf.pdf_parse_by_txt import parse_pdf_by_txt + +PARSE_TYPE_TXT = "txt" +PARSE_TYPE_OCR = "ocr" + + +def parse_txt_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, + **kwargs): + """ + 解析文本类pdf + """ + pdf_info_dict = parse_pdf_by_txt( + pdf_bytes, + pdf_models, + imageWriter, + start_page_id=start_page, + debug_mode=is_debug, + ) + + pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT + + pdf_info_dict["_version_name"] = __version__ + + return pdf_info_dict + + +def parse_ocr_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, *args, + **kwargs): + """ + 解析ocr类pdf + """ + pdf_info_dict = parse_pdf_by_ocr( + pdf_bytes, + pdf_models, + imageWriter, + start_page_id=start_page, + debug_mode=is_debug, + ) + + pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR + + pdf_info_dict["_version_name"] = __version__ + + return pdf_info_dict + + +def parse_union_pdf(pdf_bytes: bytes, pdf_models: list, imageWriter: AbsReaderWriter, is_debug=False, start_page=0, + input_model_is_empty: bool = 
False,
+                    *args, **kwargs):
+    """
+    ocr和文本混合的pdf,全部解析出来
+    """
+
+    def parse_pdf(method):
+        try:
+            return method(
+                pdf_bytes,
+                pdf_models,
+                imageWriter,
+                start_page_id=start_page,
+                debug_mode=is_debug,
+            )
+        except Exception as e:
+            logger.exception(e)
+            return None
+
+    pdf_info_dict = parse_pdf(parse_pdf_by_txt)
+    if pdf_info_dict is None or pdf_info_dict.get("_need_drop", False):
+        logger.warning("parse_pdf_by_txt drop or error, switch to parse_pdf_by_ocr")
+        if input_model_is_empty:
+            pdf_models = doc_analyze(pdf_bytes, ocr=True)
+        pdf_info_dict = parse_pdf(parse_pdf_by_ocr)
+        if pdf_info_dict is None:
+            raise Exception("Both parse_pdf_by_txt and parse_pdf_by_ocr failed.")
+        else:
+            pdf_info_dict["_parse_type"] = PARSE_TYPE_OCR
+    else:
+        pdf_info_dict["_parse_type"] = PARSE_TYPE_TXT
+
+    pdf_info_dict["_version_name"] = __version__
+
+    return pdf_info_dict
diff --git a/requirements-qa.txt b/requirements-qa.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d397f6cdfc083cf4f1d12c6f5865ea85fd7091df
--- /dev/null
+++ b/requirements-qa.txt
@@ -0,0 +1,16 @@
+pytest
+Levenshtein
+nltk
+rapidfuzz
+statistics
+openxlab #安装opendatalab
+pandas
+numpy
+matplotlib
+seaborn
+scipy
+scikit-learn
+tqdm
+htmltabletomd
+pypandoc
+pyopenssl==24.0.0
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6b1665a8ec8bb80d739f0b1cc4c00e8af5a1f2f6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+boto3>=1.28.43
+Brotli>=1.1.0
+click>=8.1.7
+PyMuPDF>=1.24.7
+loguru>=0.6.0
+numpy>=1.21.6
+fast-langdetect>=0.2.1
+wordninja>=2.0.0
+scikit-learn>=1.0.2
+pdfminer.six>=20231228
+# The requirements.txt must ensure that only necessary external dependencies are introduced. If there are new dependencies to add, please contact the project administrator.
diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..4cfcde7495fb851f8e550ab074e89c8cb4a90bcd --- /dev/null +++ b/setup.py @@ -0,0 +1,51 @@ +from pathlib import Path +from setuptools import setup, find_packages +from magic_pdf.libs.version import __version__ + + +def parse_requirements(filename): + with open(filename) as f: + lines = f.read().splitlines() + + requires = [] + + for line in lines: + if "http" in line: + pkg_name_without_url = line.split('@')[0].strip() + requires.append(pkg_name_without_url) + else: + requires.append(line) + + return requires + + +if __name__ == '__main__': + with Path(Path(__file__).parent, + 'README.md').open(encoding='utf-8') as file: + long_description = file.read() + setup( + name="magic_pdf", # 项目名 + version=__version__, # 自动从tag中获取版本号 + packages=find_packages() + ["magic_pdf.resources"], # 包含所有的包 + package_data={ + "magic_pdf.resources": ["**"], # 包含magic_pdf.resources目录下的所有文件 + }, + install_requires=parse_requirements('requirements.txt'), # 项目依赖的第三方库 + extras_require={ + "gpu": ["paddleocr==2.7.3", "paddlepaddle-gpu"], + "cpu": ["paddleocr==2.7.3", "paddlepaddle"], + "full-cpu": ["unimernet", "matplotlib", "ultralytics", "paddleocr==2.7.3", "paddlepaddle"], + }, + description="A practical tool for converting PDF to Markdown", # 简短描述 + long_description=long_description, # 详细描述 + long_description_content_type="text/markdown", # 如果README是Markdown格式 + url="https://github.com/opendatalab/MinerU", + python_requires=">=3.9", # 项目依赖的 Python 版本 + entry_points={ + "console_scripts": [ + "magic-pdf = magic_pdf.cli.magicpdf:cli" + ], + }, # 项目提供的可执行命令 + include_package_data=True, # 是否包含非代码文件,如数据文件、配置文件等 + zip_safe=False, # 是否使用 zip 文件格式打包,一般设为 False + ) diff --git a/update_version.py b/update_version.py new file mode 100644 index 0000000000000000000000000000000000000000..b51081625a051ded30a2f228fd67468177c59c5e --- /dev/null +++ b/update_version.py @@ -0,0 +1,27 @@ +import os +import 
subprocess
+
+
+def get_version():
+    command = ["git", "describe", "--tags"]
+    try:
+        version = subprocess.check_output(command).decode().strip()
+        version_parts = version.split("-")
+        if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
+            return version_parts[1]
+        else:
+            raise ValueError(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
+    except Exception as e:
+        print(e)
+        return "0.0.0"
+
+
+def write_version_to_commons(version):
+    commons_path = os.path.join(os.path.dirname(__file__), 'magic_pdf', 'libs', 'version.py')
+    with open(commons_path, 'w') as f:
+        f.write(f'__version__ = "{version}"\n')
+
+
+if __name__ == '__main__':
+    version_name = get_version()
+    write_version_to_commons(version_name)