Document Upload Tutorial

Overview

Document Q&A allows users to upload documents and ask questions about their content. This tutorial covers:

  • Supported document formats
  • Document upload process
  • Document validation
  • Error handling
  • Best practices

Supported Document Formats

Document Q&A supports the following file formats:

  • PDF (.pdf) - Portable Document Format
  • TXT (.txt) - Plain text files
  • DOC (.doc) - Microsoft Word Document
  • DOCX (.docx) - Microsoft Word Open XML Document

Note: The maximum file size is 10MB. Files with scanned text should have clear, readable content for optimal results.

Document Upload Process

Frontend Implementation

The document upload component uses the useDropzone hook from the react-dropzone library for drag-and-drop functionality, and axios to send the file to the backend:

import { useDropzone } from 'react-dropzone';
import axios from 'axios';
import { useState } from 'react';

/**
 * Drag-and-drop upload widget for a single document.
 *
 * Accepts one PDF, TXT, DOC, or DOCX file of at most 10MB, posts it to
 * "/api/upload" as multipart form data under the "file" field, reports
 * upload progress, and stores the returned document ID in localStorage
 * under "currentDocumentId".
 */
export default function FileUpload() {
  const [file, setFile] = useState(null);
  const [uploadProgress, setUploadProgress] = useState(0);
  const [isUploading, setIsUploading] = useState(false);

  // Called by react-dropzone with the files that passed its accept/size checks.
  const handleUpload = async (acceptedFiles) => {
    // Nothing to do when every dropped file was rejected by the dropzone.
    if (acceptedFiles.length === 0) return;
    const fileToUpload = acceptedFiles[0];
    setFile(fileToUpload);
    setIsUploading(true);
    setUploadProgress(0);

    const formData = new FormData();
    formData.append("file", fileToUpload);

    try {
      const response = await axios.post(
        "/api/upload",
        formData,
        {
          headers: {
            "Content-Type": "multipart/form-data",
          },
          onUploadProgress: (progressEvent) => {
            // `total` can be missing, so only compute a percentage when
            // it is available.
            if (progressEvent.total) {
              const progress = Math.round(
                (progressEvent.loaded * 100) / progressEvent.total
              );
              setUploadProgress(progress);
            }
          },
        }
      );

      // Store document ID for later use
      localStorage.setItem("currentDocumentId", response.data.document_id);

      // Success notification
    } catch (error) {
      // Never swallow the failure silently: log it and drop the stale
      // selection so the UI does not keep showing a file that was not
      // uploaded. Event tracking and the user-facing notification are shown
      // in the Error Handling section.
      console.error("Upload error:", error);
      setFile(null);
    } finally {
      // Reset upload state after a short delay so the final progress value
      // stays visible for a moment.
      setTimeout(() => {
        setIsUploading(false);
        setUploadProgress(0);
      }, 1000);
    }
  };

  const { getRootProps, getInputProps, isDragActive } = useDropzone({
    onDrop: handleUpload,
    maxFiles: 1,
    accept: {
      "application/pdf": [".pdf"],
      "text/plain": [".txt"],
      "application/msword": [".doc"],
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        [".docx"],
    },
    maxSize: 10 * 1024 * 1024, // 10MB
    disabled: isUploading, // block a second drop while one upload is in flight
  });

  // Component JSX
}

Backend Implementation

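The backend handles document validation, storage, and processing. The route below is declared as /upload on a router, so the router must be mounted under the /api prefix for the frontend's request to /api/upload to reach it: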

# FastAPI route
@router.post("/upload")
async def upload_document(file: UploadFile = File(...)) -> Dict[str, str]:
    """Upload a document for Q&A.

    Args:
        file: The uploaded file, taken from the ``file`` form field.

    Returns:
        A dict with the new ``document_id`` and a confirmation ``message``.

    Raises:
        HTTPException: 400 when the document service rejects the file with
            a ``ValueError`` (validation failure); 500 for any other
            failure while saving the document.
    """
    try:
        document_id = await document_service.save_document(file)
    except HTTPException:
        # Already an HTTP-level error; pass it through untouched.
        raise
    except ValueError as e:
        # Validation failures are the client's problem and carry a message
        # meant for the user, so report them as a bad request.
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Anything else is a server-side failure: do not label it a bad
        # request and do not leak internal details to the client.
        raise HTTPException(
            status_code=500,
            detail="Failed to save the uploaded document",
        ) from e

    return {
        "document_id": document_id,
        "message": "Document uploaded successfully",
    }

# Document service
async def save_document(self, file: UploadFile) -> str:
    """Save an uploaded document and return its ID.

    The file is validated first, then copied into ``settings.UPLOAD_DIR``
    under a freshly generated UUID4 file name.

    Args:
        file: The uploaded file to persist.

    Returns:
        The generated document ID, which is also the stored file's name.

    Raises:
        ValueError: If validation rejects the file.
    """
    # Validate file
    await self._validate_file(file)

    # Generate unique ID
    document_id = str(uuid.uuid4())

    # Create directory if it doesn't exist
    os.makedirs(settings.UPLOAD_DIR, exist_ok=True)

    # Save file in fixed-size chunks so the whole upload is never held in
    # memory at once.
    file_path = os.path.join(settings.UPLOAD_DIR, document_id)
    chunk_size = 1024 * 1024  # 1 MiB per read
    saved = False
    try:
        async with aiofiles.open(file_path, 'wb') as out_file:
            while True:
                chunk = await file.read(chunk_size)
                if not chunk:
                    break
                await out_file.write(chunk)
        saved = True
    finally:
        if not saved:
            # The copy failed or was interrupted: remove the partial file so
            # no orphaned, truncated document is left behind.
            try:
                os.remove(file_path)
            except OSError:
                # Best-effort cleanup; the original error is what matters.
                pass

    return document_id

Document Validation

Document validation ensures that only supported file types and sizes are processed:

async def _validate_file(self, file: UploadFile) -> None:
    """Validate the uploaded file's size and MIME type.

    The file position is reset to the start before returning or raising, so
    the caller can read the file again.

    Args:
        file: The uploaded file to check.

    Raises:
        ValueError: If the file is larger than ``settings.MAX_FILE_SIZE``
            bytes, or if the MIME type detected from its content is not in
            ``SUPPORTED_MIME_TYPES``.
    """
    # Check file size. Reading one byte past the limit is enough to tell
    # whether the file is too large, without pulling an arbitrarily large
    # upload into memory.
    content = await file.read(settings.MAX_FILE_SIZE + 1)
    await file.seek(0)  # Reset file position

    if len(content) > settings.MAX_FILE_SIZE:
        raise ValueError(
            f"File size exceeds the maximum allowed size of "
            f"{settings.MAX_FILE_SIZE / (1024 * 1024):.1f}MB"
        )

    # Check MIME type, detected from the file's bytes
    mime_type = magic.from_buffer(content, mime=True)
    if mime_type not in SUPPORTED_MIME_TYPES:
        raise ValueError(
            f"Unsupported file type: {mime_type}. "
            f"Supported types: {', '.join(SUPPORTED_MIME_TYPES)}"
        )

Error Handling

Proper error handling ensures a good user experience:

// Frontend error handling
try {
  // Upload code
} catch (error) {
  console.error("Upload error:", error);

  // Prefer the backend's explanation when there is one: the upload route
  // sends its validation message (file too large, unsupported type) in the
  // `detail` field of the error response.
  let serverDetail;
  if (error && error.response && error.response.data) {
    serverDetail = error.response.data.detail;
  }
  const hasServerDetail = typeof serverDetail === "string" && serverDetail !== "";

  let errorMessage = "Unknown error";
  if (hasServerDetail) {
    errorMessage = serverDetail;
  } else if (error instanceof Error) {
    errorMessage = error.message;
  }

  // Track error event
  trackEvent("document_upload_error", {
    documentSize: fileToUpload.size,
    documentType: fileToUpload.type,
    errorMessage,
  });

  // Show error notification; fall back to a generic message when the
  // backend gave no usable explanation (e.g. a network failure).
  toast({
    title: "Error",
    description: hasServerDetail ? serverDetail : "Failed to upload file",
    variant: "destructive",
    duration: 3000,
  });

  // Drop the selection so the UI does not show a file that was not uploaded.
  setFile(null);
}

Best Practices

  • Validate on both client and server: Client-side validation gives users immediate feedback, but it can be bypassed, so the backend must enforce the same file type and size rules for security.
  • Show upload progress: Provide visual feedback during uploads, especially for larger files.
  • Handle errors gracefully: Display user-friendly error messages and log detailed errors for debugging.
  • Secure file storage: Implement proper access controls and consider file encryption for sensitive documents.
  • Clean up temporary files: Implement a cleanup mechanism for documents that are no longer needed.
  • Optimize for performance: Consider using streaming uploads for large files and implement caching where appropriate.