From fafe6c03ce91b13833b6950800b123252636dfe4 Mon Sep 17 00:00:00 2001
From: aralyekta <aralyektayarimca@gmail.com>
Date: Wed, 30 Apr 2025 08:05:22 +0000
Subject: [PATCH 1/7] Fix ingestion pagination for jira / zendesk

---
 .../backend/core/requester.py                 | 53 ++++++++++++-------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/gurubase-backend/backend/core/requester.py b/src/gurubase-backend/backend/core/requester.py
index 2f8097ca..5015a3a4 100644
--- a/src/gurubase-backend/backend/core/requester.py
+++ b/src/gurubase-backend/backend/core/requester.py
@@ -947,7 +947,7 @@ def list_issues(self, jql_query, start=0, max_results=50):
         List Jira issues using JQL query with pagination
         Args:
             jql_query (str): JQL query string to filter issues
-            start (int): Starting index for pagination
+            start (int): Starting index for pagination (unused, kept for compatibility)
             max_results (int): Maximum number of results to fetch per request
         Returns:
             list: List of Jira issues matching the query
@@ -955,24 +955,39 @@ def list_issues(self, jql_query, start=0, max_results=50):
             ValueError: If API request fails
         """
         try:
-            # Get issues using JQL
-            issues_data = self.jira.jql(jql_query, start=start, limit=max_results)
-            issues = []
+            all_issues = []
+            current_start = 0
+            page_size = max_results
             
-            for issue in issues_data.get('issues', []):
-                formatted_issue = {
-                    'id': issue.get('id'),
-                    # 'key': issue.get('key'),
-                    # 'summary': issue.get('fields', {}).get('summary'),
-                    # 'issue_type': issue.get('fields', {}).get('issuetype', {}).get('name'),
-                    # 'status': issue.get('fields', {}).get('status', {}).get('name'),
-                    # 'priority': issue.get('fields', {}).get('priority', {}).get('name'),
-                    # 'assignee': issue.get('fields', {}).get('assignee', {}).get('displayName'),
-                    'link': f"{self.url}/browse/{issue.get('key')}"
-                }
-                issues.append(formatted_issue)
+            while True:
+                # Get issues using JQL
+                issues_data = self.jira.jql(jql_query, start=current_start, limit=page_size)
+                issues = issues_data.get('issues', [])
+                
+                if not issues:
+                    break
+                    
+                for issue in issues:
+                    formatted_issue = {
+                        'id': issue.get('id'),
+                        # 'key': issue.get('key'),
+                        # 'summary': issue.get('fields', {}).get('summary'),
+                        # 'issue_type': issue.get('fields', {}).get('issuetype', {}).get('name'),
+                        # 'status': issue.get('fields', {}).get('status', {}).get('name'),
+                        # 'priority': issue.get('fields', {}).get('priority', {}).get('name'),
+                        # 'assignee': issue.get('fields', {}).get('assignee', {}).get('displayName'),
+                        'link': f"{self.url}/browse/{issue.get('key')}"
+                    }
+                    all_issues.append(formatted_issue)
+                
+                # If we got fewer issues than requested, we've reached the end
+                if len(issues) < page_size:
+                    break
+                    
+                # Move to the next page
+                current_start += page_size
                 
-            return issues
+            return all_issues
         except Exception as e:
             logger.error(f"Error listing Jira issues: {str(e)}", exc_info=True)
             if "401" in str(e):
@@ -1296,7 +1311,7 @@ def list_articles(self, batch_size=100):
                     if article.get('draft') is False:
                         all_articles.append(self._format_article(article))
 
-                url = data.get('next_page')
+                url = data.get('links', {}).get('next')
             return all_articles
         except requests.exceptions.RequestException as e:
             status_code = e.response.status_code if e.response is not None else None
@@ -1384,7 +1399,7 @@ def _get_article_comments(self, article_id, batch_size=100):
                 comments = data.get('comments', [])
                 all_comments.extend([self._format_article_comment(comment) for comment in comments])
 
-                url = data.get('next_page')
+                url = data.get('links', {}).get('next')
             all_comments.sort(key=lambda x: x['created_at'])
             return all_comments
         except requests.exceptions.RequestException as e:

From b58103e314b96ed64dac43c5d862562cf13a339c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aral=20Yekta=20Yar=C4=B1mca?=
 <44121565+aralyekta@users.noreply.github.com>
Date: Wed, 30 Apr 2025 01:15:18 -0700
Subject: [PATCH 2/7] Update requester.py

Reduce confluence page size to 25
---
 src/gurubase-backend/backend/core/requester.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gurubase-backend/backend/core/requester.py b/src/gurubase-backend/backend/core/requester.py
index 5015a3a4..0ab5d903 100644
--- a/src/gurubase-backend/backend/core/requester.py
+++ b/src/gurubase-backend/backend/core/requester.py
@@ -1975,7 +1975,7 @@ def list_pages(self, cql=None):
                 try:
                     # Use pagination to get all pages from the space
                     page_start = 0
-                    page_limit = 100  # Fetch 100 pages at a time
+                    page_limit = 25  # Fetch 25 pages at a time
                     while True:
                         pages_batch = self.confluence.get_all_pages_from_space(
                             space_key, 
@@ -2007,7 +2007,7 @@ def list_pages(self, cql=None):
                 # Implement pagination for the CQL query as well
                 cql_page_ids = set()
                 cql_start = 0
-                cql_limit = 100  # Fetch 100 results at a time
+                cql_limit = 25  # Fetch 100 results at a time
                 
                 while True:
                     cql_results = self.confluence.cql(cql, start=cql_start, limit=cql_limit)
@@ -2104,7 +2104,7 @@ def get_page_comments(self, page_id, start=0, limit=50):
             
             # Use pagination to get all comments
             page_start = 0
-            page_limit = 100  # Fetch 100 comments at a time
+            page_limit = 25  # Fetch 100 comments at a time
             while True:
                 try:
                     # The Confluence API doesn't support direct pagination for comments
@@ -2178,7 +2178,7 @@ def list_spaces(self, start=0, limit=50):
             
             # Use pagination to get all spaces
             space_start = 0
-            space_limit = 100  # Fetch 100 spaces at a time
+            space_limit = 25  # Fetch 100 spaces at a time
             while True:
                 spaces_data = self.confluence.get_all_spaces(start=space_start, limit=space_limit)
                 results = spaces_data.get('results', [])

From 92c812c0d4e0ac94079b4afecb8b2ea264fb28fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aral=20Yekta=20Yar=C4=B1mca?=
 <44121565+aralyekta@users.noreply.github.com>
Date: Wed, 30 Apr 2025 01:16:45 -0700
Subject: [PATCH 3/7] Update requester.py

Update comments
---
 src/gurubase-backend/backend/core/requester.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gurubase-backend/backend/core/requester.py b/src/gurubase-backend/backend/core/requester.py
index 0ab5d903..5bf16575 100644
--- a/src/gurubase-backend/backend/core/requester.py
+++ b/src/gurubase-backend/backend/core/requester.py
@@ -2007,7 +2007,7 @@ def list_pages(self, cql=None):
                 # Implement pagination for the CQL query as well
                 cql_page_ids = set()
                 cql_start = 0
-                cql_limit = 25  # Fetch 100 results at a time
+                cql_limit = 25  # Fetch 25 results at a time
                 
                 while True:
                     cql_results = self.confluence.cql(cql, start=cql_start, limit=cql_limit)
@@ -2104,7 +2104,7 @@ def get_page_comments(self, page_id, start=0, limit=50):
             
             # Use pagination to get all comments
             page_start = 0
-            page_limit = 25  # Fetch 100 comments at a time
+            page_limit = 25  # Fetch 25 comments at a time
             while True:
                 try:
                     # The Confluence API doesn't support direct pagination for comments
@@ -2178,7 +2178,7 @@ def list_spaces(self, start=0, limit=50):
             
             # Use pagination to get all spaces
             space_start = 0
-            space_limit = 25  # Fetch 100 spaces at a time
+            space_limit = 25  # Fetch 25 spaces at a time
             while True:
                 spaces_data = self.confluence.get_all_spaces(start=space_start, limit=space_limit)
                 results = spaces_data.get('results', [])

From 234a0799f31c585a0bef22a36d422f93695d08e2 Mon Sep 17 00:00:00 2001
From: aralyekta <aralyektayarimca@gmail.com>
Date: Wed, 30 Apr 2025 12:10:56 +0000
Subject: [PATCH 4/7] Add scripts to populate ingestion sources

---
 .../scripts/onetime/populate_confluence.py    | 104 +++++++++
 .../backend/scripts/onetime/populate_jira.py  | 112 ++++++++++
 .../scripts/onetime/populate_zendesk.py       | 210 ++++++++++++++++++
 3 files changed, 426 insertions(+)
 create mode 100644 src/gurubase-backend/backend/scripts/onetime/populate_confluence.py
 create mode 100644 src/gurubase-backend/backend/scripts/onetime/populate_jira.py
 create mode 100644 src/gurubase-backend/backend/scripts/onetime/populate_zendesk.py

diff --git a/src/gurubase-backend/backend/scripts/onetime/populate_confluence.py b/src/gurubase-backend/backend/scripts/onetime/populate_confluence.py
new file mode 100644
index 00000000..c824b28f
--- /dev/null
+++ b/src/gurubase-backend/backend/scripts/onetime/populate_confluence.py
@@ -0,0 +1,104 @@
+# Create confluence spaces and pages
+import os
+import django
+import random
+import string
+from datetime import datetime
+import sys
+import time  # Add time module for throttling
+# Set up Django environment
+sys.path.append('/workspaces/gurubase/src/gurubase-backend/backend')
+sys.path.append('/workspaces/gurubase-backend/backend')
+sys.path.append('/workspace/backend')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend.settings")
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'backend.settings')
+django.setup()
+
+from core.models import Integration, GuruType
+from core.requester import ConfluenceRequester
+from django.contrib.auth import get_user_model
+
+User = get_user_model()
+
+def generate_random_text(length=1000):
+    """Generate random text for page content"""
+    words = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit']
+    return ' '.join(random.choices(words, k=length))
+
+def create_confluence_spaces_and_pages():
+    # Look up the existing Confluence integration for the 'gurubase' guru type (nothing is created)
+    integration = Integration.objects.get(
+        type=Integration.Type.CONFLUENCE,
+        guru_type=GuruType.objects.get(slug='gurubase')
+    )
+
+    # Initialize Confluence requester
+    confluence = ConfluenceRequester(integration)
+
+    # Create 5 spaces
+    space_names = [
+        'Engineering Documentation',
+        'Product Requirements',
+        'Design Guidelines',
+        'Development Standards',
+        'Project Management'
+    ]
+
+    for space_name in space_names:
+        try:
+            # Generate space key
+            space_key = ''.join(random.choices(string.ascii_uppercase, k=3))
+            
+            # Check if space exists
+            try:
+                existing_space = confluence.confluence.get_space(space_key)
+                print(f"Space {space_name} ({space_key}) already exists, skipping creation")
+                space = existing_space
+            except Exception:
+                # Space doesn't exist, create it
+                space = confluence.confluence.create_space(
+                    space_key=space_key,
+                    space_name=space_name,
+                )
+                print(f"Created space: {space_name} ({space_key})")
+
+            # Create 300 pages in each space
+            for i in range(300):
+                page_title = f"Page {i+1} - {space_name}"
+                page_content = f"""
+                # {page_title}
+                
+                Created on: {datetime.now().isoformat()}
+                
+                {generate_random_text()}
+                """
+                
+                # Create page
+                page = confluence.confluence.create_page(
+                    space=space_key,
+                    title=page_title,
+                    body=page_content,
+                    type='page'
+                )
+                
+                # Add throttling between page creation and comments
+                time.sleep(1)  # Wait 1 second between page creation and comments
+                
+                # Add some comments to the page
+                for j in range(3):  # Add 3 comments per page
+                    comment_content = f"Comment {j+1} on {page_title}\n\n{generate_random_text(100)}"
+                    confluence.confluence.add_comment(
+                        page_id=page['id'],
+                        text=comment_content
+                    )
+                    time.sleep(0.5)  # Wait 0.5 seconds between comment additions
+                
+                print(f"Created page {i+1}/500 in space {space_name}")
+                time.sleep(2)  # Wait 2 seconds between page creations
+                
+        except Exception as e:
+            print(f"Error creating space {space_name}: {str(e)}")
+            continue
+
+if __name__ == '__main__':
+    create_confluence_spaces_and_pages()
\ No newline at end of file
diff --git a/src/gurubase-backend/backend/scripts/onetime/populate_jira.py b/src/gurubase-backend/backend/scripts/onetime/populate_jira.py
new file mode 100644
index 00000000..05977c45
--- /dev/null
+++ b/src/gurubase-backend/backend/scripts/onetime/populate_jira.py
@@ -0,0 +1,112 @@
+# Create jira issues in Done status
+
+import os
+import sys
+import time
+import django
+
+# Set up Django environment
+sys.path.append('/workspaces/gurubase/src/gurubase-backend/backend')
+sys.path.append('/workspace/backend')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend.settings")
+django.setup()
+
+from core.models import Integration, GuruType
+from core.requester import JiraRequester
+
+def throttle_request(func):
+    """Decorator to throttle API requests"""
+    last_request_time = 0
+    min_interval = 1.0  # Minimum seconds between requests
+    
+    def wrapper(*args, **kwargs):
+        nonlocal last_request_time
+        current_time = time.time()
+        time_since_last = current_time - last_request_time
+        
+        if time_since_last < min_interval:
+            sleep_time = min_interval - time_since_last
+            time.sleep(sleep_time)
+        
+        result = func(*args, **kwargs)
+        last_request_time = time.time()
+        return result
+    
+    return wrapper
+
+class ThrottledJiraRequester(JiraRequester):
+    """Extended JiraRequester with throttling"""
+    
+    @throttle_request
+    def create_issue(self, project_key, summary, description, issue_type="Task", priority="Medium"):
+        """Create a Jira issue with throttling"""
+        fields = {
+            'project': {'key': project_key},
+            'summary': summary,
+            'description': description,
+            'issuetype': {'name': issue_type},
+        }
+        
+        return self.jira.create_issue(fields=fields)
+    
+    @throttle_request
+    def add_comment(self, issue_key, comment):
+        """Add a comment to a Jira issue with throttling"""
+        return self.jira.issue_add_comment(issue_key, comment)
+
+    @throttle_request
+    def transitions(self, issue_key):
+        """Get available transitions for an issue"""
+        return self.jira.get_issue_transitions(issue_key)
+
+
+def create_jira_content():
+    # Get the Jira integration
+    integration = Integration.objects.get(
+        type=Integration.Type.JIRA,
+        guru_type=GuruType.objects.get(slug='gurubase')
+    )
+
+    # Initialize Jira requester
+    jira = ThrottledJiraRequester(integration)
+
+    # Get available projects
+    try:
+        projects = jira.jira.projects()
+        if not projects:
+            print("No projects found in Jira. Please create at least one project first.")
+            return
+        
+        project_keys = [project['key'] for project in projects]
+        print(f"Found projects: {', '.join(project_keys)}")
+    except Exception as e:
+        print(f"Error fetching projects: {e}")
+        return
+
+    # Create 500 issues
+    print("Creating Jira issues...")
+    for i in range(500):
+        try:
+            # Create issue
+            issue = jira.create_issue(
+                project_key=project_keys[0],  # Use first project
+                summary=f"Test Issue {i+1}",
+                description="Test description",
+                issue_type="Task"
+            )
+            
+            # Transition to Done
+            transitions = jira.transitions(issue['key'])
+            for transition in transitions:
+                if transition['to'].lower() == 'done':
+                    jira.jira.issue_transition(issue['key'], 'done')
+                    break
+            
+            print(f"Created issue {i+1}/100: {issue['key']}")
+            
+        except Exception as e:
+            print(f"Error creating issue {i+1}: {e}")
+            continue
+
+if __name__ == '__main__':
+    create_jira_content()
\ No newline at end of file
diff --git a/src/gurubase-backend/backend/scripts/onetime/populate_zendesk.py b/src/gurubase-backend/backend/scripts/onetime/populate_zendesk.py
new file mode 100644
index 00000000..cd1a2876
--- /dev/null
+++ b/src/gurubase-backend/backend/scripts/onetime/populate_zendesk.py
@@ -0,0 +1,210 @@
+# Create zendesk tickets and articles
+import os
+import django
+import random
+import string
+import sys
+from datetime import datetime, timedelta
+
+import requests
+
+# Set up Django environment
+sys.path.append('/workspaces/gurubase/src/gurubase-backend/backend')
+sys.path.append('/workspace/backend')
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend.settings")
+django.setup()
+
+from core.models import Integration, GuruType
+from core.requester import ZendeskRequester
+
+def generate_random_text(length=1000):
+    """Generate random text for content"""
+    words = ['lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit']
+    return ' '.join(random.choices(words, k=length))
+
+def create_zendesk_content():
+    # Get the Zendesk integration
+    integration = Integration.objects.get(
+        type=Integration.Type.ZENDESK,
+        guru_type=GuruType.objects.get(slug='gurubase')
+    )
+
+    # Initialize Zendesk requester
+    zendesk = ZendeskRequester(integration)
+
+    # Ticket fixture data (the ticket-creation loop below is currently commented out)
+    ticket_subjects = [
+        "Bug Report: Application Crash",
+        "Feature Request: New Dashboard",
+        "Support: Login Issues",
+        "Question: API Documentation",
+        "Feedback: User Interface",
+        "Issue: Performance Problems",
+        "Request: Account Access",
+        "Problem: Data Synchronization",
+        "Inquiry: Pricing Plans",
+        "Report: Security Concern"
+    ]
+
+    ticket_priorities = ['urgent', 'high', 'normal', 'low']
+    ticket_statuses = ['new', 'open', 'pending', 'solved']
+
+    # print("Creating tickets...")
+    # for i in range(100):
+    #     try:
+    #         # Create ticket with random subject and priority
+    #         subject = f"{random.choice(ticket_subjects)} #{i+1}"
+    #         description = f"""
+    #         Issue Description:
+    #         {generate_random_text()}
+            
+    #         Steps to Reproduce:
+    #         1. {generate_random_text(50)}
+    #         2. {generate_random_text(50)}
+    #         3. {generate_random_text(50)}
+            
+    #         Expected Behavior:
+    #         {generate_random_text()}
+            
+    #         Actual Behavior:
+    #         {generate_random_text()}
+    #         """
+            
+    #         # Create ticket using Zendesk API
+    #         ticket_data = {
+    #             'ticket': {
+    #                 'subject': subject,
+    #                 'comment': {
+    #                     'body': description
+    #                 },
+    #                 'priority': random.choice(ticket_priorities),
+    #                 'status': random.choice(ticket_statuses)
+    #             }
+    #         }
+            
+    #         # Add ticket using requests since ZendeskRequester doesn't have create method
+    #         import requests
+    #         response = requests.post(
+    #             f"https://{integration.zendesk_domain}/api/v2/tickets.json",
+    #             json=ticket_data,
+    #             auth=(f"{integration.zendesk_user_email}/token", integration.zendesk_api_token)
+    #         )
+    #         response.raise_for_status()
+            
+    #         # Add 3 comments to the ticket
+    #         ticket_id = response.json()['ticket']['id']
+    #         for j in range(3):
+    #             comment_data = {
+    #                 'ticket': {
+    #                     'comment': {
+    #                         'body': f"Comment {j+1} on ticket {subject}\n\n{generate_random_text(200)}",
+    #                         'public': True
+    #                     }
+    #                 }
+    #             }
+                
+    #             requests.put(
+    #                 f"https://{integration.zendesk_domain}/api/v2/tickets/{ticket_id}.json",
+    #                 json=comment_data,
+    #                 auth=(f"{integration.zendesk_user_email}/token", integration.zendesk_api_token)
+    #             )
+            
+    #         print(f"Created ticket {i+1}/100: {subject}")
+            
+    #     except Exception as e:
+    #         print(f"Error creating ticket {i+1}: {str(e)}")
+    #         continue
+
+    # Create 50 help center articles
+    article_categories = [
+        "Getting Started",
+        "User Guide",
+        "API Documentation",
+        "Troubleshooting",
+        "Best Practices"
+    ]
+
+    print("\nCreating help center section...")
+    try:
+        # Fetch the existing sections and reuse the first one (no section is created)
+        section_data = {
+            'section': {
+                'name': 'General Documentation',
+                'locale': 'en-us'
+            }
+        }
+        
+        response = requests.get(
+            f"https://{integration.zendesk_domain}/api/v2/help_center/en-us/sections.json",
+            auth=(f"{integration.zendesk_user_email}/token", integration.zendesk_api_token)
+        )
+        response.raise_for_status()
+        section_id = response.json()['sections'][0]['id']
+        print(f"Created section with ID: {section_id}")
+    except Exception as e:
+        print(f"Error creating section: {str(e)}")
+        return
+
+    print("\nCreating help center articles...")
+    for i in range(50):
+        try:
+            # Create article with random category
+            title = f"Article {i+1}: {random.choice(article_categories)} Guide"
+            body = f"""
+            <h1>{title}</h1>
+            
+            <h2>Overview</h2>
+            <p>{generate_random_text(5)}</p>
+            """
+            
+            # Create article using Zendesk API
+            article_data = {
+                'article': {
+                    'title': title,
+                    'body': body,
+                    'locale': 'en-us',
+                    'section_id': 78910,  # FIXME: hard-coded placeholder; the section_id fetched above is not used
+                    'label_names': ['documentation', 'guide'],
+                    'comments_disabled': False
+                }
+            }
+            
+            # Add article using requests with proper headers
+            headers = {
+                'Content-Type': 'application/json'
+            }
+            
+            response = requests.post(
+                f"https://{integration.zendesk_domain}/api/v2/help_center/articles.json",
+                json=article_data,
+                headers=headers,
+                auth=(f"{integration.zendesk_user_email}/token", integration.zendesk_api_token)
+            )
+            response.raise_for_status()
+            
+            # Add 2 comments to the article
+            article_id = response.json()['article']['id']
+            for j in range(2):
+                comment_data = {
+                    'comment': {
+                        'body': f"Comment {j+1} on article {title}\n\n{generate_random_text(100)}",
+                        'public': True
+                    }
+                }
+                
+                requests.post(
+                    f"https://{integration.zendesk_domain}/api/v2/help_center/articles/{article_id}/comments.json",
+                    json=comment_data,
+                    headers=headers,
+                    auth=(f"{integration.zendesk_user_email}/token", integration.zendesk_api_token)
+                )
+            
+            print(f"Created article {i+1}/50: {title}")
+            
+        except Exception as e:
+            print(f"Error creating article {i+1}: {str(e)}")
+            continue
+
+if __name__ == '__main__':
+    create_zendesk_content()
+    create_zendesk_content()
\ No newline at end of file

From 676fb1ecabfbc013728e9198e5f26a132de68516 Mon Sep 17 00:00:00 2001
From: aralyekta <aralyektayarimca@gmail.com>
Date: Wed, 30 Apr 2025 13:28:06 +0000
Subject: [PATCH 5/7] Fix confluence pagination and filtering

---
 .../backend/core/requester.py                 | 118 ++++++++++++------
 1 file changed, 79 insertions(+), 39 deletions(-)

diff --git a/src/gurubase-backend/backend/core/requester.py b/src/gurubase-backend/backend/core/requester.py
index 5bf16575..55b6d10f 100644
--- a/src/gurubase-backend/backend/core/requester.py
+++ b/src/gurubase-backend/backend/core/requester.py
@@ -1962,8 +1962,78 @@ def list_pages(self, cql=None):
             ValueError: If API request fails
         """
         try:
-            # Get all pages from all spaces
+            # If CQL is provided, use it directly to get pages
+            if cql:
+                cql += " AND type=page"
+                all_pages = []
+                seen_page_ids = set()  # Track unique page IDs
+                cql_limit = 100  # Fetch 100 results at a time
+                
+                # Initial request
+                url = f"{self.url}/wiki/rest/api/search"
+                params = {
+                    'cql': cql,
+                    'limit': cql_limit,
+                    'expand': 'space'
+                }
+                
+                while True:
+                    response = requests.get(
+                        url,
+                        auth=(self.confluence.username, self.confluence.password),
+                        params=params,
+                        timeout=30
+                    )
+                    
+                    if response.status_code == 401:
+                        raise ValueError("Invalid Confluence credentials")
+                    elif response.status_code == 403:
+                        raise ValueError("Confluence API access forbidden")
+                    elif response.status_code != 200:
+                        raise ValueError(f"Confluence API request failed with status {response.status_code}")
+                    
+                    cql_results = response.json()
+                    results = cql_results.get('results', [])
+                    
+                    if not results:
+                        break
+                        
+                    for result in results:
+                        content = result.get('content', {})
+                        page_id = content.get('id')
+                        
+                        # Skip if we've seen this page ID before
+                        if not page_id or page_id in seen_page_ids:
+                            continue
+                            
+                        seen_page_ids.add(page_id)
+                        
+                        # Get space info from the expanded result
+                        space = content.get('space', {})
+                        space_key = space.get('key')
+                        space_name = space.get('name', '')
+                        
+                        # Format and add the page
+                        formatted_page = self._format_page(content, space_key, space_name)
+                        all_pages.append(formatted_page)
+                    
+                    # Check if there's a next page
+                    next_link = cql_results.get('_links', {}).get('next')
+                    if not next_link:
+                        break
+                        
+                    # Update URL and params for next request
+                    url = f"{self.url}/wiki{next_link}"
+                    params = {}  # Clear params as they're included in the next_link
+                
+                return {
+                    'pages': all_pages,
+                    'page_count': len(all_pages)
+                }
+            
+            # If no CQL, get all pages from all spaces
             all_pages = []
+            seen_page_ids = set()  # Track unique page IDs
             
             spaces_data = self.confluence.get_all_spaces()
             spaces = spaces_data.get('results', [])
@@ -1975,7 +2045,7 @@ def list_pages(self, cql=None):
                 try:
                     # Use pagination to get all pages from the space
                     page_start = 0
-                    page_limit = 25  # Fetch 25 pages at a time
+                    page_limit = 100  # Fetch 100 pages at a time
                     while True:
                         pages_batch = self.confluence.get_all_pages_from_space(
                             space_key, 
@@ -1987,8 +2057,12 @@ def list_pages(self, cql=None):
                             break
                             
                         for page in pages_batch:
-                            formatted_page = self._format_page(page, space_key, space_name)
-                            all_pages.append(formatted_page)
+                            page_id = page.get('id')
+                            # Only add page if we haven't seen its ID before
+                            if page_id and page_id not in seen_page_ids:
+                                seen_page_ids.add(page_id)
+                                formatted_page = self._format_page(page, space_key, space_name)
+                                all_pages.append(formatted_page)
                             
                         # If we got fewer pages than requested, we've reached the end
                         if len(pages_batch) < page_limit:
@@ -2002,40 +2076,6 @@ def list_pages(self, cql=None):
                     logger.warning(f"Error fetching pages from space {space_key}: {str(e)}")
                     continue
             
-            # Filter pages by CQL if provided
-            if cql:
-                # Implement pagination for the CQL query as well
-                cql_page_ids = set()
-                cql_start = 0
-                cql_limit = 25  # Fetch 25 results at a time
-                
-                while True:
-                    cql_results = self.confluence.cql(cql, start=cql_start, limit=cql_limit)
-                    results = cql_results.get('results', [])
-                    
-                    if not results:
-                        break
-                        
-                    for result in results:
-                        content = result.get('content', {})
-                        cql_page_ids.add(content.get('id'))
-                    
-                    # If we got fewer results than requested, we've reached the end
-                    if len(results) < cql_limit:
-                        break
-                        
-                    # Move to the next page
-                    cql_start += cql_limit
-                
-                # Filter the all_pages list to only include pages that match the CQL
-                filtered_pages = [page for page in all_pages if page.get('id') in cql_page_ids]
-                
-                return {
-                    'pages': filtered_pages,
-                    'page_count': len(filtered_pages)
-                }
-            
-            # No CQL filtering, just return all pages
             return {
                 'pages': all_pages,
                 'page_count': len(all_pages)
@@ -2104,7 +2144,7 @@ def get_page_comments(self, page_id, start=0, limit=50):
             
             # Use pagination to get all comments
             page_start = 0
-            page_limit = 25  # Fetch 25 comments at a time
+            page_limit = 100  # Fetch 100 comments at a time
             while True:
                 try:
                     # The Confluence API doesn't support direct pagination for comments

From d195d6d7709355beb8b252e7af85782806e8c11d Mon Sep 17 00:00:00 2001
From: aralyekta <aralyektayarimca@gmail.com>
Date: Wed, 30 Apr 2025 13:33:34 +0000
Subject: [PATCH 6/7] Improve confluence error handling

---
 src/gurubase-backend/backend/core/requester.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/gurubase-backend/backend/core/requester.py b/src/gurubase-backend/backend/core/requester.py
index 55b6d10f..a2604ef8 100644
--- a/src/gurubase-backend/backend/core/requester.py
+++ b/src/gurubase-backend/backend/core/requester.py
@@ -1990,7 +1990,14 @@ def list_pages(self, cql=None):
                     elif response.status_code == 403:
                         raise ValueError("Confluence API access forbidden")
                     elif response.status_code != 200:
-                        raise ValueError(f"Confluence API request failed with status {response.status_code}")
+                        if 'could not parse' in response.text.lower():
+                            raise ValueError("Invalid CQL query.")
+                        try:  # The error body is not guaranteed to be a JSON object
+                            detail = str(response.json().get('message') or '').partition(':')[2].strip()
+                        except (ValueError, AttributeError):
+                            detail = ''
+                        # Prefer the API's explanation (text after the first ':'); else report the status
+                        raise ValueError(detail or f"Confluence API request failed with status {response.status_code}")
                     
                     cql_results = response.json()
                     results = cql_results.get('results', [])

From b43f337598858f92a0bdbca5e2d71b7153e1c690 Mon Sep 17 00:00:00 2001
From: aralyekta <aralyektayarimca@gmail.com>
Date: Wed, 30 Apr 2025 14:23:49 +0000
Subject: [PATCH 7/7] Add direct message support to slack

---
 .../core/integrations/slack_strategy.py       |  3 +-
 .../migrations/0083_integration_allow_dm.py   | 18 +++++++
 src/gurubase-backend/backend/core/models.py   |  2 +
 src/gurubase-backend/backend/core/views.py    | 54 +++++++++++++------
 src/gurubase-frontend/src/app/actions.js      |  9 +++-
 .../Integrations/ChannelsComponent.jsx        | 26 ++++++++-
 6 files changed, 93 insertions(+), 19 deletions(-)
 create mode 100644 src/gurubase-backend/backend/core/migrations/0083_integration_allow_dm.py

diff --git a/src/gurubase-backend/backend/core/integrations/slack_strategy.py b/src/gurubase-backend/backend/core/integrations/slack_strategy.py
index eb1a41d3..3cfb1742 100644
--- a/src/gurubase-backend/backend/core/integrations/slack_strategy.py
+++ b/src/gurubase-backend/backend/core/integrations/slack_strategy.py
@@ -55,7 +55,8 @@ def _list_channels() -> list:
                     {
                         'id': c['id'],
                         'name': c['name'],
-                        'allowed': False
+                        'allowed': False,
+                        'direct_messages': False
                     }
                     for c in data.get('channels', [])
                 ])
diff --git a/src/gurubase-backend/backend/core/migrations/0083_integration_allow_dm.py b/src/gurubase-backend/backend/core/migrations/0083_integration_allow_dm.py
new file mode 100644
index 00000000..c289212c
--- /dev/null
+++ b/src/gurubase-backend/backend/core/migrations/0083_integration_allow_dm.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.18 on 2025-04-30 14:16
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):  # schema change: adds the `allow_dm` field to `integration`
+
+    dependencies = [
+        ('core', '0082_gurutype_private'),  # this migration is ordered after core 0082
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='integration',
+            name='allow_dm',
+            field=models.BooleanField(default=False),  # opt-in flag: False unless explicitly set
+        ),
+    ]
diff --git a/src/gurubase-backend/backend/core/models.py b/src/gurubase-backend/backend/core/models.py
index 91573997..57f1bc1c 100644
--- a/src/gurubase-backend/backend/core/models.py
+++ b/src/gurubase-backend/backend/core/models.py
@@ -2028,6 +2028,8 @@ class Type(models.TextChoices):
     zendesk_api_token = models.TextField(null=True, blank=True)
     zendesk_user_email = models.TextField(null=True, blank=True)
 
+    allow_dm = models.BooleanField(default=False)  # gates bot replies to direct messages (read by the Slack event handler)
+
     date_created = models.DateTimeField(auto_now_add=True)
     date_updated = models.DateTimeField(auto_now=True)
 
diff --git a/src/gurubase-backend/backend/core/views.py b/src/gurubase-backend/backend/core/views.py
index c869d960..8517bf3f 100644
--- a/src/gurubase-backend/backend/core/views.py
+++ b/src/gurubase-backend/backend/core/views.py
@@ -1941,13 +1941,18 @@ def manage_channels(request, guru_type, integration_type):
         try:
             channels = request.data.get('channels', [])
             integration.channels = channels
+
+            if 'allow_dm' in request.data:
+                integration.allow_dm = request.data['allow_dm']
+
             integration.save()
-            
+
             return Response({
                 'id': integration.id,
                 'type': integration.type,
                 'guru_type': integration.guru_type.slug,
-                'channels': integration.channels
+                'channels': integration.channels,
+                'allow_dm': integration.allow_dm
             })
         except Exception as e:
             logger.error(f"Error updating channels: {e}", exc_info=True)
@@ -2422,17 +2427,25 @@ async def send_channel_unauthorized_message(
     client: WebClient,
     channel_id: str,
     thread_ts: str,
-    guru_slug: str
+    guru_slug: str,
+    dm: bool
 ) -> None:
     """Send a message explaining how to authorize the channel."""
     try:
         base_url = await sync_to_async(get_base_url)()
         settings_url = f"{base_url.rstrip('/')}/guru/{guru_slug}/integrations/slack"
-        message = (
-            "❌ This channel is not authorized to use the bot.\n\n"
-            f"Please visit <{settings_url}|Gurubase Settings> to configure "
-            "the bot and add this channel to the allowed channels list."
-        )
+        if dm:
+            message = (
+                "❌ Bot direct messages are not enabled.\n\n"
+                f"Please visit <{settings_url}|Gurubase Settings> to configure "
+                "the bot and enable direct messages."
+            )
+        else:
+            message = (
+                "❌ This channel is not authorized to use the bot.\n\n"
+                f"Please visit <{settings_url}|Gurubase Settings> to configure "
+                "the bot and add this channel to the allowed channels list."
+            )
         client.chat_postMessage(
             channel=channel_id,
             thread_ts=thread_ts,
@@ -2465,12 +2478,17 @@ def process_event():
                 
                 # Only proceed if it's a message event and not from a bot
                 if event["type"] == "message" and "subtype" not in event and event.get("user") != event.get("bot_id"):
+                    # Slack tags 1:1 conversations with channel_type "im". Use .get() so a
+                    # payload without that key is handled as a regular channel message.
+                    dm = event.get('channel_type') == 'im'
                     # Get bot user ID from authorizations
                     bot_user_id = data.get("authorizations", [{}])[0].get("user_id")
                     user_message = event["text"]
                     
                     # First check if the bot is mentioned
-                    if not (bot_user_id and f"<@{bot_user_id}>" in user_message):
+                    if not dm and not (bot_user_id and f"<@{bot_user_id}>" in user_message):
+                        return
+                    elif dm and event['user'] == bot_user_id:
                         return
                         
                     team_id = data.get('team_id')
@@ -2500,12 +2518,15 @@ def process_event():
                         channel_id = event["channel"]
                         
                         # Check if the current channel is allowed
-                        channels = integration.channels
-                        channel_allowed = False
-                        for channel in channels:
-                            if str(channel.get('id')) == channel_id and channel.get('allowed', False):
-                                channel_allowed = True
-                                break
+                        if not dm:
+                            channels = integration.channels
+                            channel_allowed = False
+                            for channel in channels:
+                                if str(channel.get('id')) == channel_id and channel.get('allowed', False):
+                                    channel_allowed = True
+                                    break
+                        else:
+                            channel_allowed = integration.allow_dm
 
                         # Get thread_ts if it exists (means we're in a thread)
                         thread_ts = event.get("thread_ts") or event.get("ts")
@@ -2519,7 +2540,8 @@ def process_event():
                                     client=client,
                                     channel_id=channel_id,
                                     thread_ts=thread_ts,
-                                    guru_slug=integration.guru_type.slug
+                                    guru_slug=integration.guru_type.slug,
+                                    dm=dm
                                 ))
                             finally:
                                 loop.close()
diff --git a/src/gurubase-frontend/src/app/actions.js b/src/gurubase-frontend/src/app/actions.js
index 6c7a214f..f3119a9e 100644
--- a/src/gurubase-frontend/src/app/actions.js
+++ b/src/gurubase-frontend/src/app/actions.js
@@ -918,12 +918,19 @@ export async function saveIntegrationChannels(
   channels
 ) {
   try {
+    const payload =
+      integrationType === "SLACK" && channels !== null &&
+      typeof channels === "object" && // typeof null is also "object", hence the null guard
+      "direct_messages" in channels
+        ? { channels: channels.channels, allow_dm: channels.direct_messages }
+        : { channels };
+
     const response = await makeAuthenticatedRequest(
       `${process.env.NEXT_PUBLIC_BACKEND_FETCH_URL}/${guruType}/integrations/${integrationType}/channels/`,
       {
         method: "POST",
         headers: { "Content-Type": "application/json" },
-        body: JSON.stringify({ channels })
+        body: JSON.stringify(payload)
       }
     );
 
diff --git a/src/gurubase-frontend/src/components/Integrations/ChannelsComponent.jsx b/src/gurubase-frontend/src/components/Integrations/ChannelsComponent.jsx
index 10a26e3a..bda004d3 100644
--- a/src/gurubase-frontend/src/components/Integrations/ChannelsComponent.jsx
+++ b/src/gurubase-frontend/src/components/Integrations/ChannelsComponent.jsx
@@ -42,6 +42,7 @@ const ChannelsComponent = ({
   const [hasChanges, setHasChanges] = useState(false);
   const [isSaving, setIsSaving] = useState(false);
   const [open, setOpen] = useState(false);
+  const [directMessages, setDirectMessages] = useState(false);
 
   useEffect(() => {
     const fetchChannels = async () => {
@@ -60,6 +61,7 @@ const ChannelsComponent = ({
         } else {
           setChannels(channelsData?.channels || []);
           setOriginalChannels(channelsData?.channels || []);
+          setDirectMessages(channelsData?.allow_dm || false);
           setInternalError(null);
         }
       } catch (err) {
@@ -116,6 +118,23 @@ const ChannelsComponent = ({
       </div>
       {/* Allowed Channels */}
       <div className="space-y-4 guru-xs:mt-4 mt-5">
+        {type === "slack" && (
+          <div className="flex items-center gap-2 mb-4">
+            <input
+              type="checkbox"
+              id="direct_messages"
+              checked={directMessages}
+              onChange={(e) => {
+                setDirectMessages(e.target.checked);
+                setHasChanges(true);
+              }}
+              className="h-4 w-4 rounded border-gray-300"
+            />
+            <label htmlFor="direct_messages" className="text-sm text-gray-700">
+              Enable Direct Messages
+            </label>
+          </div>
+        )}
         {channels
           .filter((c) => c.allowed)
           .map((channel) => (
@@ -272,7 +291,12 @@ const ChannelsComponent = ({
               const response = await saveIntegrationChannels(
                 guruData?.slug,
                 type.toUpperCase(),
-                channels.filter((c) => c.allowed)
+                type === "slack"
+                  ? {
+                      channels: channels.filter((c) => c.allowed),
+                      direct_messages: directMessages
+                    }
+                  : channels.filter((c) => c.allowed)
               );
               if (!response?.error) {
                 setHasChanges(false);