python批量提取PDF文件中指定区域的文本
Page content
python批量提取PDF文件中指定区域的文本。
本文包含三个函数:print_formatted_text_blocks 打印 PDF 中所有文本块的坐标;extract_text_from_pdf 提取指定区域的文本;batch_rename_pdfs 根据提取的文本批量重命名 PDF 文件。
import fitz # PyMuPDF
import os
def print_formatted_text_blocks(pdf_path):
    """Print the bounding box of every non-empty text span in a PDF.

    The output is a ``regions = [...]`` listing. Each entry is a
    ``{'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}`` dictionary literal
    (coordinates rounded to one decimal place) followed by a comment giving
    the 1-based page number and block number, so an entry can be copied and
    passed to ``extract_text_from_pdf`` as a region.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Any exception raised by PyMuPDF propagates to the caller; the document is
    closed first.
    """
    doc = fitz.open(pdf_path)
    try:
        print("\n# PDF文本块坐标列表(可直接复制用于提取)\nregions = [")
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            blocks = page.get_text("dict", flags=0)["blocks"]
            for block_idx, block in enumerate(blocks):
                # Guard against blocks without a "lines" entry (PyMuPDF image
                # blocks have none) instead of failing with KeyError.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if not span["text"].strip():
                            continue  # whitespace-only spans are not useful regions
                        x1, y1, x2, y2 = span["bbox"]
                        print(f" {{'x1': {x1:.1f}, 'y1': {y1:.1f}, "
                              f"'x2': {x2:.1f}, 'y2': {y2:.1f} }}, # 第{page_num + 1}页 区块{block_idx + 1}")
        print("]\n")
    finally:
        # Release the file handle even when a page fails to load or parse.
        doc.close()
def _region_to_corners(region):
    """Return (x1, y1, x2, y2) for a region dict, or None if it matches neither format."""
    if all(key in region for key in ('x1', 'y1', 'x2', 'y2')):
        return region['x1'], region['y1'], region['x2'], region['y2']
    if all(key in region for key in ('x', 'y', 'width', 'height')):
        left, top = region['x'], region['y']
        return left, top, left + region['width'], top + region['height']
    return None


def extract_text_from_pdf(pdf_path, regions, page_num=0):
    """Extract the text inside each given region of one PDF page.

    Two region formats are accepted:
      1. Top-left and bottom-right corners:
         ``{'x1': ..., 'y1': ..., 'x2': ..., 'y2': ...}``
      2. Top-left corner plus size:
         ``{'x': ..., 'y': ..., 'width': ..., 'height': ...}``

    Args:
        pdf_path: Path of the PDF file to read.
        regions: Iterable of region dictionaries in either format above.
        page_num: Index of the page to read, passed to ``load_page``
            (default 0, the first page).

    Returns:
        A tuple with one stripped string per region, in the order of
        ``regions``. A region with unrecognised keys, or one that contains no
        text, yields ``''``. If the document has no pages, every entry is
        ``''``.

    Exceptions raised by PyMuPDF (for example when the file cannot be opened
    or ``load_page`` rejects ``page_num``) propagate to the caller; the
    document is closed first.
    """
    doc = fitz.open(pdf_path)
    try:
        if len(doc) == 0:
            print("错误: PDF文件没有页面!")
            return tuple('' for _ in regions)

        page = doc.load_page(page_num)
        results = []
        for region in regions:
            corners = _region_to_corners(region)
            if corners is None:
                print(f"警告: 区域坐标无效,跳过: {region}")
                # Keep the result aligned with the input order.
                results.append('')
                continue
            clip = fitz.Rect(*corners)
            results.append(page.get_text("text", clip=clip).strip())
        return tuple(results)
    finally:
        # Close on every path, including errors from load_page / get_text.
        doc.close()
# Maps characters that must not appear in a file name to "_": the characters
# the original code replaced (/ \ : * ? " < > |) plus ASCII control characters,
# so line breaks inside extracted text cannot end up in a file name.
_FILENAME_SANITIZER = {ord(ch): '_' for ch in '/\\:*?"<>|'}
_FILENAME_SANITIZER.update({code: '_' for code in range(32)})

# Regions used when the caller supplies none; the new file name is built from
# the text found in them, in this order.
_DEFAULT_RENAME_REGIONS = (
    {'x1': 106.5, 'y1': 81.1, 'x2': 133.5, 'y2': 90.1},
    {'x1': 496.0, 'y1': 109.1, 'x2': 541.0, 'y2': 118.1},
)


def _sanitize_filename_part(text):
    """Return text with file-name-unsafe characters replaced by underscores."""
    return text.translate(_FILENAME_SANITIZER)


def batch_rename_pdfs(folder_path, regions=None):
    """Rename every PDF file in a folder using text extracted from the file.

    For each ``*.pdf`` file (case-insensitive, sub-directories skipped) the
    text of every region is extracted with ``extract_text_from_pdf``,
    sanitised, and joined with ``_`` to form ``<part1>_<part2>....pdf``.
    If every extracted part is empty, the file is renamed to
    ``extracted_<original stem>.pdf`` instead. A file is never renamed onto a
    path that already exists.

    Args:
        folder_path (str): Folder containing the PDF files.
        regions: Optional list of region dictionaries as accepted by
            ``extract_text_from_pdf``. Defaults to the two built-in regions.

    Returns:
        list: One dictionary per PDF file, processed in sorted file-name
        order, with the keys ``'original'`` (old file name), ``'new'`` (new
        file name, or ``None`` when extraction failed) and ``'status'``
        (outcome message).
    """
    if regions is None:
        regions = _DEFAULT_RENAME_REGIONS
    regions = list(regions)

    results = []
    # Sorted so the processing order (and therefore which file wins a name
    # collision) does not depend on the directory listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith('.pdf'):
            continue
        pdf_path = os.path.join(folder_path, filename)
        if os.path.isdir(pdf_path):
            continue  # a directory whose name happens to end in .pdf

        # One unreadable file must not abort the whole batch, so any
        # extraction error is recorded and the loop moves on.
        try:
            extracted = extract_text_from_pdf(pdf_path, regions)
        except Exception as e:
            results.append({
                'original': filename,
                'new': None,
                'status': f'提取失败: {str(e)}'
            })
            continue

        parts = [_sanitize_filename_part(text) for text in extracted]
        if any(parts):
            new_filename = "_".join(parts) + ".pdf"
        else:
            # Nothing was extracted: fall back to a name derived from the
            # original one rather than producing "_.pdf".
            new_filename = f"extracted_{os.path.splitext(filename)[0]}.pdf"
        new_path = os.path.join(folder_path, new_filename)

        if os.path.exists(new_path):
            results.append({
                'original': filename,
                'new': new_filename,
                'status': '已存在,未重命名'
            })
            continue

        try:
            os.rename(pdf_path, new_path)
        except (OSError, ValueError) as e:
            results.append({
                'original': filename,
                'new': new_filename,
                'status': f'重命名失败: {str(e)}'
            })
        else:
            results.append({
                'original': filename,
                'new': new_filename,
                'status': '重命名成功'
            })
    return results
# # 示例用法
# if __name__ == "__main__":
# # 定义提取区域(按顺序)
# regions = [
# {'x1': 106.5, 'y1': 81.1, 'x2': 133.5, 'y2': 90.1},
# {'x1': 496.0, 'y1': 109.1, 'x2': 541.0, 'y2': 118.1},
# ]
# pdf_path = "1752567478483.pdf" # 替换为你的PDF路径
# extracted = extract_text_from_pdf(pdf_path, regions)
# print("提取结果(元组):")
# print(extracted)
# Example usage
if __name__ == "__main__":
    # Replace with the path of the folder that holds your PDF files.
    folder_path = r"C:\Users\Administrator\Downloads"
    results = batch_rename_pdfs(folder_path)

    print("批量重命名结果:")
    separator = "-" * 50
    for entry in results:
        report_lines = [f"原文件: {entry['original']}"]
        # 'new' is None when no new name could be determined.
        if entry['new']:
            report_lines.append(f"新文件: {entry['new']}")
        report_lines.append(f"状态: {entry['status']}")
        report_lines.append(separator)
        print("\n".join(report_lines))