diff --git a/README.md b/README.md index 58d3ce2c..56628645 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ This detector supports checking the following file types: * ✅ Images (supported) * ✅ PDF files (supported) * ✅ Videos (supported) -* ⏳ Files in compressed packages (planned) +* ✅ Files in compressed packages (supported) ## Quick Start @@ -46,16 +46,17 @@ Supported architectures: `x86_64`, `ARM64`. ### Use the API for Content Checking ```bash -# Detect images +# Detection curl -X POST -F "file=@/path/to/image.jpg" http://localhost:3333/check -# Detect PDF files -curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3333/pdf -# Detect video files -curl -X POST -F "file=@/path/to/file.mp4" http://localhost:3333/video ``` +### Use the Built-in Web Interface for Detection + +Visit: [http://localhost:3333](http://localhost:3333) + ## Public API + You can use the public API service provided by vx.link. ```bash diff --git a/README_cn.md b/README_cn.md index 633d0a2a..99b6b90c 100644 --- a/README_cn.md +++ b/README_cn.md @@ -29,7 +29,7 @@ * ✅ 图片(已支持) * ✅ PDF 文件(已支持) * ✅ 视频(已支持) -* ⏳ 压缩包中的文件(计划) +* ✅ 压缩包中的文件(已支持) ## 快速开始 @@ -44,13 +44,12 @@ docker run -d -p 3333:3333 --name nsfw-detector vxlink/nsfw_detector:latest ### 使用 API 进行内容检查 ```bash -# 检测图片 +# 检测 curl -X POST -F "file=@/path/to/image.jpg" http://localhost:3333/check -# 检测 PDF 文件 -curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3333/pdf -# 检测视频文件 -curl -X POST -F "file=@/path/to/file.mp4" http://localhost:3333/video -``` + +### 使用内置的 Web 界面进行检测 + +访问地址:[http://localhost:3333](http://localhost:3333) ## 公共 API diff --git a/README_jp.md b/README_jp.md index ff603896..89241243 100644 --- a/README_jp.md +++ b/README_jp.md @@ -28,7 +28,7 @@ * ✅ 画像(対応済み) * ✅ PDF(対応済み) * ✅ 動画(対応済み) -* ⏳ 圧縮ファイル内のファイル(対応予定) +* ✅ 圧縮ファイル内のファイル(対応済み) ## クイックスタート @@ -43,20 +43,20 @@ docker run -d -p 3333:3333 --name nsfw-detector vxlink/nsfw_detector:latest ### API を使用したコンテンツ確認 ```bash -# 画像の検出 +# 検出 curl -X POST -F "file=@/path/to/image.jpg" http://localhost:3333/check -# PDFの検出 -curl -X POST -F "file=@/path/to/file.pdf" http://localhost:3333/pdf -# 检测视频文件 -curl -X POST -F "file=@/path/to/file.mp4" http://localhost:3333/video ``` +### Web インターフェースを使用した検出 + +アクセス先:[http://localhost:3333](http://localhost:3333) + ## パブリック API vx.link が提供する公開 API サービスをご利用いただけます。 ```bash -# ファイルの検出、自動的にファイルタイプを識別します +# 検出 curl -X POST -F "file=@/path/to/image.jpg" https://vx.link/public/nsfw ``` diff --git a/app.py b/app.py index 5945ca90..8b382bf6 100644 --- a/app.py +++ b/app.py @@ -1,174 +1,179 @@ # app.py -from flask import Flask, request, jsonify -from transformers import pipeline -from PIL import Image +from flask import Flask, request, jsonify, send_file, Response +import tempfile import os -import fitz +import shutil +from pathlib import Path +from PIL import Image import io -import cv2 -import numpy as np -import tempfile +import time +import logging +import magic +from config import MAX_FILE_SIZE, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS +from utils import ArchiveHandler, can_process_file, sort_files_by_priority +from processors import process_image, process_pdf_file, process_video_file, process_archive -app = Flask(__name__) +# 配置日志 +logger = logging.getLogger(__name__) -os.environ['TRANSFORMERS_CACHE'] = '/root/.cache/huggingface' -pipe = pipeline("image-classification", model="Falconsai/nsfw_image_detection") +app = Flask(__name__) -@app.route('/check', methods=['POST']) -def check_image(): +# Load index.html content +CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) +with open(os.path.join(CURRENT_DIR, 'index.html'), 'r', encoding='utf-8') as f: + INDEX_HTML = f.read() + +@app.route('/') +def index(): + """Serve the index.html file""" + return Response(INDEX_HTML, mimetype='text/html') + +def detect_file_type(file_stream): + """检测文件类型 + Returns: (mime_type, file_extension) + """ + # 读取文件头部数据用于检测 + header = file_stream.read(2048) + file_stream.seek(0) # 重置文件指针 + + mime = magic.Magic(mime=True) + mime_type = mime.from_buffer(header) + + # 基于 MIME 类型映射文件扩展名 + mime_to_ext = { + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/webp': '.webp', + 'image/bmp': '.bmp', + 'application/pdf': '.pdf', + 'video/mp4': '.mp4', + 'video/x-msvideo': '.avi', + 'video/x-matroska': '.mkv', + 'video/quicktime': '.mov', + 'video/x-ms-wmv': '.wmv', + 'video/webm': '.webm', + 'application/x-rar-compressed': '.rar', + 'application/zip': '.zip', + 'application/x-7z-compressed': '.7z', + 'application/gzip': '.gz' + } + + return mime_type, mime_to_ext.get(mime_type) + +def process_file_by_type(file_stream, detected_type, original_filename): + """根据文件类型选择处理方法""" + mime_type, ext = detected_type + + # 如果有原始文件扩展名,优先使用 + if original_filename and '.' in original_filename: + original_ext = os.path.splitext(original_filename)[1].lower() + if original_ext in IMAGE_EXTENSIONS or original_ext == '.pdf' or original_ext in VIDEO_EXTENSIONS: + ext = original_ext + + if not ext: + return { + 'status': 'error', + 'message': 'Unsupported file type' + }, 400 + try: - if 'file' not in request.files: - return jsonify({'status': 'error', 'message': 'No file part'}), 400 - - file = request.files['file'] - if file.filename == '': - return jsonify({'status': 'error', 'message': 'No file selected'}), 400 - - try: - image = Image.open(file.stream) - result = pipe(image) - return jsonify({ + if ext in IMAGE_EXTENSIONS: + image = Image.open(file_stream) + result = process_image(image) + return { 'status': 'success', - 'filename': file.filename, + 'filename': original_filename, 'result': result - }) - except Exception as e: - return jsonify({ - 'status': 'error', - 'message': 'Invalid image file' - }), 400 - - except Exception as e: - return jsonify({'status': 'error', 'message': str(e)}), 500 - -@app.route('/pdf', methods=['POST']) -def check_pdf(): - try: - if 'file' not in request.files: - return jsonify({'status': 'error', 'message': 'No file part'}), 400 - - file = request.files['file'] - if file.filename == '': - return jsonify({'status': 'error', 'message': 'No file selected'}), 400 - - pdf_stream = file.read() - doc = fitz.open(stream=pdf_stream, filetype="pdf") - - last_result = None - images_found = False - - for page_num in range(len(doc)): - page = doc[page_num] - image_list = page.get_images() + } - for img_index, img in enumerate(image_list): - images_found = True - xref = img[0] - base_image = doc.extract_image(xref) - image_bytes = base_image["image"] - - image = Image.open(io.BytesIO(image_bytes)) - result = pipe(image) - - last_result = { + elif ext == '.pdf': + pdf_stream = file_stream.read() + result = process_pdf_file(pdf_stream) + if result: + return { 'status': 'success', - 'filename': file.filename, + 'filename': original_filename, 'result': result } + return { + 'status': 'error', + 'message': 'No processable content found in PDF' + }, 400 + + elif ext in VIDEO_EXTENSIONS: + # 创建临时文件并正确写入内容 + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=ext) + try: + # 读取文件内容并写入临时文件 + file_content = file_stream.read() + with open(temp_file.name, 'wb') as f: + f.write(file_content) - nsfw_score = next((item['score'] for item in result if item['label'] == 'nsfw'), 0) - normal_score = next((item['score'] for item in result if item['label'] == 'normal'), 1) - - if nsfw_score > 0.9 or normal_score < 0.6: - return jsonify(last_result) - - if not images_found: - return jsonify({'status': 'false', 'message': 'No images found in PDF'}) + result = process_video_file(temp_file.name) + if result: + return { + 'status': 'success', + 'filename': original_filename, + 'result': result + } + return { + 'status': 'error', + 'message': 'No processable content found in video' + }, 400 + finally: + if os.path.exists(temp_file.name): + os.unlink(temp_file.name) + + elif ext in {'.zip', '.rar', '.7z', '.gz'}: + return process_archive(file_stream, original_filename) + + else: + return { + 'status': 'error', + 'message': 'Unsupported file type' + }, 400 - return jsonify(last_result) - except Exception as e: - return jsonify({'status': 'error', 'message': str(e)}), 500 + logger.error(f"处理文件时出错: {str(e)}") + return { + 'status': 'error', + 'message': str(e) + }, 500 -@app.route('/video', methods=['POST']) -def check_video(): +@app.route('/check', methods=['POST']) +def check_file(): + """统一的文件检查入口点""" try: if 'file' not in request.files: - return jsonify({'status': 'error', 'message': 'No file part'}), 400 - - file = request.files['file'] - if file.filename == '': - return jsonify({'status': 'error', 'message': 'No file selected'}), 400 - - temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') - file.save(temp_file.name) - - cap = cv2.VideoCapture(temp_file.name) - if not cap.isOpened(): - os.unlink(temp_file.name) return jsonify({ 'status': 'error', - 'message': 'Unable to read video file' + 'message': 'No file found' }), 400 - - fps = cap.get(cv2.CAP_PROP_FPS) - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - duration = total_frames / fps if fps > 0 else 0 - - if duration < 1: - frame_positions = [0] - elif duration <= 10: - frame_positions = np.linspace(0, total_frames - 1, int(duration), dtype=int) - else: - frame_positions = np.linspace(0, total_frames - 1, 20, dtype=int) - - last_result = None - frames_processed = 0 - - for frame_pos in frame_positions: - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_pos) - ret, frame = cap.read() - - if not ret: - continue - - frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) - pil_image = Image.fromarray(frame_rgb) - - result = pipe(pil_image) - frames_processed += 1 - - last_result = { - 'status': 'success', - 'filename': file.filename, - 'frame_number': int(frame_pos), - 'timestamp': frame_pos / fps if fps > 0 else 0, - 'result': result - } - nsfw_score = next((item['score'] for item in result if item['label'] == 'nsfw'), 0) - normal_score = next((item['score'] for item in result if item['label'] == 'normal'), 1) - - if nsfw_score > 0.9 or normal_score < 0.6: - cap.release() - os.unlink(temp_file.name) - return jsonify(last_result) - - cap.release() - os.unlink(temp_file.name) - - if frames_processed == 0: + file = request.files['file'] + if file.filename == '': return jsonify({ 'status': 'error', - 'message': 'No frames could be processed from the video' + 'message': 'No file selected' }), 400 - return jsonify(last_result) + logger.info(f"接收到文件: {file.filename}") + + # 检测文件类型 + detected_type = detect_file_type(file.stream) + logger.info(f"检测到文件类型: {detected_type}") + + # 处理文件 + result = process_file_by_type(file.stream, detected_type, file.filename) + return jsonify(result) if isinstance(result, dict) else jsonify(result[0]), result[1] if isinstance(result, tuple) else 200 except Exception as e: - if 'temp_file' in locals(): - os.unlink(temp_file.name) - return jsonify({'status': 'error', 'message': str(e)}), 500 + logger.error(f"处理过程发生错误: {str(e)}") + return jsonify({ + 'status': 'error', + 'message': str(e) + }), 500 if __name__ == '__main__': app.run(host='0.0.0.0', port=3333) \ No newline at end of file diff --git a/build.sh b/build.sh index f33163b4..ceda8cd1 100644 --- a/build.sh +++ b/build.sh @@ -1 +1,148 @@ -docker buildx bake \ No newline at end of file +#!/bin/bash +set -e + +# 默认配置值 +IMAGE_NAME="vxlink/nsfw_detector" +VERSION="v1.0" +PUSH="false" +CACHE_DIR="${HOME}/.docker/nsfw_detector_cache" +CACHE_FROM="" + +# 检测本机平台 +NATIVE_PLATFORM=$(docker version -f '{{.Server.Os}}/{{.Server.Arch}}' | sed 's/x86_64/amd64/') + +# 设置目标平台(默认包含所有支持的平台) +ALL_PLATFORMS="linux/amd64,linux/arm64" +PLATFORM="$NATIVE_PLATFORM" # 默认仅构建本机平台 + +# 帮助信息显示函数 +show_help() { + echo "Usage: $0 [options]" + echo "Options:" + echo " -p, --push Push images to registry after building (default: false)" + echo " -v, --version Specify version tag (default: v0.3)" + echo " -h, --help Show this help message" + echo " --no-cache Disable build cache" + echo " --platform Specify target platforms (default: native platform)" + echo " --all-platforms Build for all supported platforms" +} + +# 解析命令行参数 +while [[ $# -gt 0 ]]; do + case $1 in + -p|--push) + PUSH="true" + PLATFORM="$ALL_PLATFORMS" # 推送时默认构建所有平台 + shift + ;; + -v|--version) + VERSION="$2" + shift 2 + ;; + --no-cache) + CACHE_FROM="--no-cache" + shift + ;; + --platform) + PLATFORM="$2" + shift 2 + ;; + --all-platforms) + PLATFORM="$ALL_PLATFORMS" + shift + ;; + -h|--help) + show_help + exit 0 + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +echo "Building with configuration:" +echo "- Version: ${VERSION}" +echo "- Push to registry: ${PUSH}" +echo "- Native platform: ${NATIVE_PLATFORM}" +echo "- Target platforms: ${PLATFORM}" +echo "- Cache enabled: $([ -z "$CACHE_FROM" ] && echo "yes" || echo "no")" + +# 创建缓存目录(如果不存在) +mkdir -p "${CACHE_DIR}" + +# 配置 buildx 构建器 +BUILDER="nsfw-detector-builder" +if ! docker buildx inspect "${BUILDER}" > /dev/null 2>&1; then + docker buildx create --name "${BUILDER}" \ + --driver docker-container \ + --driver-opt network=host \ + --buildkitd-flags '--allow-insecure-entitlement security.insecure' \ + --use +else + docker buildx use "${BUILDER}" +fi + +# 设置缓存配置参数 +if [ -z "$CACHE_FROM" ]; then + CACHE_CONFIG="--cache-from=type=local,src=${CACHE_DIR} --cache-to=type=local,dest=${CACHE_DIR},mode=max" +else + CACHE_CONFIG="$CACHE_FROM" +fi + +# 构建基础命令 +BUILD_CMD="docker buildx build \ + --platform ${PLATFORM} \ + --tag ${IMAGE_NAME}:${VERSION} \ + --tag ${IMAGE_NAME}:latest \ + --file dockerfile \ + ${CACHE_CONFIG} \ + --build-arg BUILDKIT_INLINE_CACHE=1" + +if [ "$PUSH" = "true" ]; then + # 远程构建模式:推送到仓库 + BUILD_CMD="${BUILD_CMD} --push" +elif [ "$PLATFORM" = "$NATIVE_PLATFORM" ]; then + # 本地构建模式(单一本机平台):使用 --load + BUILD_CMD="${BUILD_CMD} --load" +else + # 本地构建模式(多平台或非本机平台):输出到本地 docker 镜像 + echo "Warning: Building for non-native platform(s). Images will be available through docker buildx, but not in regular docker images list." +fi + +BUILD_CMD="${BUILD_CMD} ." + +# 执行构建 +echo "Executing build command..." +eval ${BUILD_CMD} + +# 验证构建结果(仅在推送模式下) +if [ "$PUSH" = "true" ]; then + echo "Verifying manifest for version ${VERSION}..." + docker manifest inspect ${IMAGE_NAME}:${VERSION} + + echo "Verifying manifest for latest..." + docker manifest inspect ${IMAGE_NAME}:latest +fi + +# 清理和切换构建器 +if [ "$PUSH" = "true" ]; then + docker buildx use default +else + echo "Build completed for platform(s): ${PLATFORM}" +fi + +echo "Build complete!" +echo "Built images:" +echo "- ${IMAGE_NAME}:${VERSION}" +echo "- ${IMAGE_NAME}:latest" + +if [ "$PUSH" = "true" ]; then + echo "Images have been pushed to registry" +elif [ "$PLATFORM" = "$NATIVE_PLATFORM" ]; then + echo "Images are available locally via 'docker images'" +else + echo "Images are available through buildx" +fi \ No newline at end of file diff --git a/config.py b/config.py new file mode 100644 index 00000000..c5fedcd0 --- /dev/null +++ b/config.py @@ -0,0 +1,34 @@ +# config.py +import os +import rarfile +import logging + +# 配置日志 +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' +) + +# 配置 rarfile +rarfile.UNRAR_TOOL = "unrar" +rarfile.PATH_SEP = '/' + +# 使用新的环境变量名称 HF_HOME 替代 TRANSFORMERS_CACHE +os.environ['HF_HOME'] = '/root/.cache/huggingface' + +# 文件扩展名配置 +IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'} +VIDEO_EXTENSIONS = {'.mp4', '.avi', '.mkv', '.mov', '.wmv', '.ts', '.flv', '.webm'} +ARCHIVE_EXTENSIONS = {'.7z', '.rar', '.zip', '.gz'} + +# HTTP 配置 +HTTP_HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' +} + +# 文件大小限制 (20480MB) +MAX_FILE_SIZE = 20 * 1024 * 1024 * 1024 + +# 超时设置 +DOWNLOAD_TIMEOUT = 30 \ No newline at end of file diff --git a/docker-bake.hcl b/docker-bake.hcl deleted file mode 100644 index d9fb27d7..00000000 --- a/docker-bake.hcl +++ /dev/null @@ -1,31 +0,0 @@ -variable "DOCKER_USERNAME" { - default = "vxlink" -} - -variable "IMAGE_NAME" { - default = "nsfw_detector" -} - -variable "VERSION" { - default = "v0.3" -} - -group "default" { - targets = ["app"] -} - -target "app" { - context = "." - dockerfile = "Dockerfile" - tags = [ - "${DOCKER_USERNAME}/${IMAGE_NAME}:${VERSION}", - "${DOCKER_USERNAME}/${IMAGE_NAME}:latest" - ] - cache-from = ["type=registry,ref=${DOCKER_USERNAME}/${IMAGE_NAME}:cache"] - cache-to = ["type=registry,ref=${DOCKER_USERNAME}/${IMAGE_NAME}:cache,mode=max"] - push = true - platforms = [ - "linux/amd64", // x86_64 - "linux/arm64", // ARM64 - ] -} \ No newline at end of file diff --git a/dockerfile b/dockerfile index b36a18e3..a3ca8e38 100644 --- a/dockerfile +++ b/dockerfile @@ -1,13 +1,9 @@ -# Use Ubuntu as base image FROM ubuntu:22.04 -# Set working directory WORKDIR /app -# Avoid interactive prompts ENV DEBIAN_FRONTEND=noninteractive -# Install system packages RUN apt-get update && apt-get install -y \ python3 \ python3-pip \ @@ -18,36 +14,28 @@ RUN apt-get update && apt-get install -y \ libgl1-mesa-glx \ libglib2.0-0 \ libsm6 \ + libmagic1 \ libxext6 \ libxrender-dev \ && rm -rf /var/lib/apt/lists/* -# Install OpenCV +RUN pip3 install python-magic RUN pip3 install --no-cache-dir opencv-python-headless - -# Install rarfile and py7zr RUN pip3 install --no-cache-dir rarfile py7zr - -# Install Flask and its dependencies RUN pip3 install --no-cache-dir flask==2.0.1 werkzeug==2.0.3 - -# Install Pillow RUN pip3 install --no-cache-dir Pillow - -# Install transformers RUN pip3 install --no-cache-dir transformers - -# Install torch (CPU version) +RUN pip3 install --no-cache-dir PyMuPDF RUN pip3 install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu -# Pre-download the model RUN python3 -c "from transformers import pipeline; pipe = pipeline('image-classification', model='Falconsai/nsfw_image_detection', device=-1)" -# Set cache directory permissions RUN chmod -R 755 /root/.cache -# Copy local application file COPY app.py /app/app.py +COPY config.py /app/config.py +COPY processors.py /app/processors.py +COPY utils.py /app/utils.py +COPY index.html /app/index.html -# Run the application -CMD ["python3", "app.py"] \ No newline at end of file +CMD ["python3", "app.py"] diff --git a/index.html b/index.html new file mode 100644 index 00000000..e8b00ddf --- /dev/null +++ b/index.html @@ -0,0 +1,404 @@ + + + +
+ + +