Skip to content

Task/GitHub codebases #295

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 48 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
4f5e3c6
Update guru edit page
aralyekta Apr 21, 2025
b5a1db0
Improve filter
aralyekta Apr 21, 2025
0ee040e
Fix pending state + name for jira and zendesk
aralyekta Apr 21, 2025
8310284
Fix polling
aralyekta Apr 21, 2025
7e2213e
Fix table visual
aralyekta Apr 21, 2025
145b5c4
Fix github badge visuals
aralyekta Apr 21, 2025
765ba96
Add github sidebar + form management
aralyekta Apr 21, 2025
580f9f8
Remove index_repo, github_repos fields from guru type and link github…
aralyekta Apr 21, 2025
84e9e8c
Improve github repo visuals
aralyekta Apr 22, 2025
388e8af
Add last indexed date tooltip
aralyekta Apr 22, 2025
9497be4
Add comments
aralyekta Apr 22, 2025
3a51775
Return last indexed date
aralyekta Apr 22, 2025
f567d46
Fix glob update and remove unnecessary code
aralyekta Apr 22, 2025
7e41114
Add existing repo check
aralyekta Apr 22, 2025
a05a818
Add tooltip
aralyekta Apr 22, 2025
681053d
Improve typography
aralyekta Apr 22, 2025
a93f16d
Add reindex support for github data
aralyekta Apr 22, 2025
724c649
Fix pending status when github repo is added
aralyekta Apr 22, 2025
40412cc
Merge branch 'develop' of https://github.com/Gurubase/gurubase into t…
aralyekta Apr 24, 2025
45a41ac
Fix NewGuru
aralyekta Apr 24, 2025
95c37ac
Fix migrations
aralyekta Apr 24, 2025
73179ad
Fix confluence
aralyekta Apr 24, 2025
73d492a
Fix names in source table
aralyekta Apr 24, 2025
a904d50
Fix github repo creation
aralyekta Apr 24, 2025
bae8394
Fix polling state on initial guru edit load
aralyekta Apr 24, 2025
830ab12
Turn data source creation to dropdown
aralyekta Apr 24, 2025
6e4188c
Fix data source creation dropdown layout
aralyekta Apr 24, 2025
3767c20
Fix clickability of zendesk/confluence when creating a new guru
aralyekta Apr 24, 2025
86447d3
Fix references having the same titles overwriting each other
aralyekta Apr 24, 2025
12b1402
Move github details logic into data sources instead of guru types
aralyekta Apr 24, 2025
41c8b6f
Add migration
aralyekta Apr 24, 2025
7ba403d
Remove duplicate code
aralyekta Apr 24, 2025
69a1322
Fix github update
aralyekta Apr 24, 2025
110d118
Exclude extra fields in data source
aralyekta Apr 24, 2025
8d82261
Fix processing status of repositories when they are fetched but not w…
aralyekta Apr 24, 2025
69a896c
Add comment
aralyekta Apr 24, 2025
4b39b24
Set deleting source text while deleting sources
aralyekta Apr 24, 2025
2c73ad3
Fix pending state management during deletion
aralyekta Apr 24, 2025
5eca3da
Fix glob fields on backend
aralyekta Apr 24, 2025
5dbffc3
Merge branch 'develop' of https://github.com/Gurubase/gurubase into t…
aralyekta Apr 24, 2025
9cf6942
Fix FE confluence limit
aralyekta Apr 24, 2025
03fc646
Improve error message
aralyekta Apr 24, 2025
8531b8e
Change error status codes for jira/confluence/zendesk data retrieval …
aralyekta Apr 24, 2025
7d9eb8d
Fix response data parsing order
aralyekta Apr 24, 2025
df9d2df
Improve error message
aralyekta Apr 24, 2025
b55a041
Remove log
aralyekta Apr 24, 2025
a1c376e
Merge branch 'develop' of https://github.com/Gurubase/gurubase into t…
aralyekta Apr 28, 2025
092cc47
Reorder migrations
aralyekta Apr 28, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/gurubase-backend/backend/core/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def guru_type(self, obj):

@admin.register(GuruType)
class GuruTypeAdmin(admin.ModelAdmin):
list_display = ['id', 'slug', 'active', 'has_sitemap_added_questions', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'colors', 'custom', 'maintainers_list', 'github_repos', 'text_embedding_model', 'code_embedding_model', 'date_created', 'date_updated', 'github_details_updated_date']
list_display = ['id', 'slug', 'active', 'has_sitemap_added_questions', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'colors', 'custom', 'maintainers_list', 'text_embedding_model', 'code_embedding_model', 'date_created', 'date_updated']
search_fields = ['id', 'slug', 'icon_url', 'stackoverflow_tag', 'domain_knowledge', 'date_created', 'date_updated', 'maintainers__email']
list_filter = ('active', 'custom', 'has_sitemap_added_questions', 'text_embedding_model', 'code_embedding_model')
ordering = ('-id',)
Expand Down
37 changes: 21 additions & 16 deletions src/gurubase-backend/backend/core/data_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -521,38 +521,43 @@ def create(self, guru_type_object, ticket_url):
'message': str(e)
}

class GitHubRepoStrategy(DataSourceStrategy):
def create(self, guru_type_object, repo_url):
class GitHubStrategy(DataSourceStrategy):
def create(self, guru_type_object, repo):
url = repo['url']
glob_pattern = repo['glob_pattern']
glob_include = repo['include_glob']
try:
# Create the data source
data_source = DataSource.objects.create(
type=DataSource.Type.GITHUB_REPO,
guru_type=guru_type_object,
url=repo_url
url=url,
github_glob_pattern=glob_pattern,
github_glob_include=glob_include
)

data_source.save()

return {
'type': 'GITHUB_REPO',
'url': repo_url,
'type': 'GitHub',
'url': url,
'status': 'success',
'id': data_source.id,
'title': data_source.title
'title': data_source.title,
'github_glob_pattern': glob_pattern,
'github_glob_include': glob_include
}
except DataSourceExists as e:
return {
'type': 'GITHUB_REPO',
'url': repo_url,
'type': 'GitHub',
'url': url,
'status': 'exists',
'id': e.args[0]['id'],
'title': e.args[0]['title']
'title': e.args[0]['title'],
'github_glob_pattern': e.args[0]['github_glob_pattern'],
'github_glob_include': e.args[0]['github_glob_include']
}
except Exception as e:
logger.error(f'Error processing GitHub repository {repo_url}: {traceback.format_exc()}')
logger.error(f'Error processing GitHub repository {url}: {traceback.format_exc()}')
return {
'type': 'GITHUB_REPO',
'url': repo_url,
'type': 'GitHub',
'url': url,
'status': 'error',
'message': str(e)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 4.2.18 on 2025-04-21 14:44

from django.db import migrations


class Migration(migrations.Migration):
    """Remove the ``github_repos`` and ``index_repo`` fields from the ``gurutype`` model."""

    # Must be applied after core migration 0080.
    dependencies = [("core", "0080_settings_gurubase_url")]

    # Operations run in list order: ``github_repos`` is removed first, then ``index_repo``.
    operations = [
        migrations.RemoveField(model_name="gurutype", name="github_repos"),
        migrations.RemoveField(model_name="gurutype", name="index_repo"),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.18 on 2025-04-24 08:55

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add ``github_details`` and ``github_details_updated_date`` to the ``datasource`` model."""

    # Must be applied after core migration 0081.
    dependencies = [("core", "0081_remove_gurutype_github_repos_and_more")]

    operations = [
        # Optional JSON field; new rows default to an empty dict and NULL is allowed.
        migrations.AddField(
            model_name="datasource",
            name="github_details",
            field=models.JSONField(blank=True, default=dict, null=True),
        ),
        # Optional timestamp; no default, so it stays NULL until explicitly set.
        migrations.AddField(
            model_name="datasource",
            name="github_details_updated_date",
            field=models.DateTimeField(blank=True, null=True),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generated by Django 4.2.18 on 2025-04-24 09:34

from django.db import migrations


class Migration(migrations.Migration):
    """Remove ``github_details`` and ``github_details_updated_date`` from the ``gurutype`` model."""

    # Must be applied after core migration 0082, which adds the same-named
    # fields to ``datasource``.
    # NOTE(review): neither migration copies existing ``gurutype`` values across
    # before they are dropped here - confirm that is intended.
    dependencies = [("core", "0082_datasource_github_details_and_more")]

    operations = [
        migrations.RemoveField(model_name="gurutype", name="github_details"),
        migrations.RemoveField(model_name="gurutype", name="github_details_updated_date"),
    ]
51 changes: 26 additions & 25 deletions src/gurubase-backend/backend/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,6 @@ class EmbeddingModel(models.TextChoices):
name = models.CharField(max_length=50, blank=True, null=True)
maintainers = models.ManyToManyField(User, blank=True, related_name='maintained_guru_types')
stackoverflow_tag = models.CharField(max_length=100, blank=True, null=True)
github_repos = models.JSONField(default=list, blank=True)
github_details = models.JSONField(default=dict, blank=True, null=False)
github_details_updated_date = models.DateTimeField(null=True, blank=True)
colors = models.JSONField(default=dict, blank=True, null=False)
icon_url = models.CharField(max_length=2000, default="", blank=True, null=True)
ogimage_url = models.URLField(max_length=2000, default="", blank=True, null=True) # question
Expand All @@ -320,7 +317,6 @@ class EmbeddingModel(models.TextChoices):
domain_knowledge = models.TextField(default='', blank=True, null=True)
custom_instruction_prompt = models.TextField(default='', blank=True, null=True)
has_sitemap_added_questions = models.BooleanField(default=False)
index_repo = models.BooleanField(default=True)
# GitHub repository limits
github_repo_count_limit = models.IntegerField(default=1)
github_file_count_limit_per_repo_soft = models.IntegerField(default=1000) # Warning threshold
Expand Down Expand Up @@ -386,19 +382,12 @@ def save(self, *args, **kwargs):
if self.slug == '':
raise ValidationError({'msg': 'Guru type name cannot be empty'})

unique_github_repos = set(self.github_repos)

if settings.ENV != 'selfhosted' and len(unique_github_repos) > self.github_repo_count_limit:
raise ValidationError({'msg': f'You have reached the maximum number ({self.github_repo_count_limit}) of GitHub repositories for this guru type.'})

if settings.ENV == 'selfhosted':
if self.text_embedding_model == GuruType.EmbeddingModel.IN_HOUSE:
raise ValidationError({'msg': 'In-house embedding model is not allowed in selfhosted environment.'})
if self.code_embedding_model == GuruType.EmbeddingModel.IN_HOUSE:
raise ValidationError({'msg': 'In-house embedding model is not allowed in selfhosted environment.'})

self.github_repos = list(unique_github_repos)

super().save(*args, **kwargs)

def generate_widget_id(self, domain_url):
Expand Down Expand Up @@ -481,7 +470,7 @@ def ready(self):

return non_processed_count == 0 and non_written_count == 0

def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube_urls_count=0, github_urls_count=0, jira_urls_count=0, zendesk_urls_count=0, confluence_urls_count=0):
def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube_urls_count=0, jira_urls_count=0, zendesk_urls_count=0, confluence_urls_count=0, github_repos_count=0):
"""
Checks if adding a new datasource would exceed the limits for this guru type.
Returns (bool, str) tuple - (is_allowed, error_message)
Expand Down Expand Up @@ -510,11 +499,6 @@ def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube
type=DataSource.Type.YOUTUBE
).count()

github_count = DataSource.objects.filter(
guru_type=self,
type=DataSource.Type.GITHUB_REPO
).count()

jira_count = DataSource.objects.filter(
guru_type=self,
type=DataSource.Type.JIRA
Expand All @@ -535,6 +519,12 @@ def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube
guru_type=self,
type=DataSource.Type.PDF
)

github_repo_count = DataSource.objects.filter(
guru_type=self,
type=DataSource.Type.GITHUB_REPO
).count()

total_pdf_mb = 0
for source in pdf_sources:
if source.file:
Expand All @@ -549,7 +539,7 @@ def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube
return False, f"YouTube video limit ({self.youtube_count_limit}) reached"

# Check GitHub repo limit
if (github_count + github_urls_count) > self.github_repo_count_limit:
if (github_repo_count + github_repos_count) > self.github_repo_count_limit:
return False, f"GitHub repository limit ({self.github_repo_count_limit}) reached"

# Check Jira issue limit
Expand All @@ -558,7 +548,7 @@ def check_datasource_limits(self, user, file=None, website_urls_count=0, youtube

# Check Zendesk ticket limit
if (zendesk_count + zendesk_urls_count) > self.zendesk_count_limit:
return False, f"Zendesk ticket limit ({self.zendesk_count_limit}) reached"
return False, f"Zendesk ticket/article limit ({self.zendesk_count_limit}) reached"

# Check Confluence page limit
if (confluence_count + confluence_urls_count) > self.confluence_count_limit:
Expand Down Expand Up @@ -639,7 +629,7 @@ class Type(models.TextChoices):
ZENDESK = "ZENDESK"
CONFLUENCE = "CONFLUENCE"

class Status(models.TextChoices):
class Status(models.TextChoices): # This is the data retrieval status. It is not the readiness of the data source as it may not be written to milvus yet.
NOT_PROCESSED = "NOT_PROCESSED"
SUCCESS = "SUCCESS"
FAIL = "FAIL"
Expand Down Expand Up @@ -680,14 +670,16 @@ class Status(models.TextChoices):
final_summarization_created = models.BooleanField(default=False)

default_branch = models.CharField(max_length=100, null=True, blank=True) # Only used for Github Repos
github_details = models.JSONField(default=dict, blank=True, null=True) # For storing GitHub repository details
github_details_updated_date = models.DateTimeField(null=True, blank=True) # When GitHub details were last updated

private = models.BooleanField(default=False)

last_reindex_date = models.DateTimeField(auto_now_add=True, null=True, blank=True)
last_reindex_date = models.DateTimeField(auto_now_add=True, null=True, blank=True) # Set when reindex is manually done
reindex_count = models.IntegerField(default=0)

scrape_tool = models.CharField(max_length=100, null=True, blank=True)
last_successful_index_date = models.DateTimeField(null=True, blank=True)
last_successful_index_date = models.DateTimeField(null=True, blank=True) # Set when github repo is indexed/reindexed successfully
github_glob_include = models.BooleanField(default=True)
github_glob_pattern = models.CharField(max_length=100, null=True, blank=True)

Expand Down Expand Up @@ -1109,9 +1101,18 @@ def reindex(self):

if self.type == DataSource.Type.GITHUB_REPO:
self.content = ''

self.save()
self.delete_from_milvus()
self.save()

# Import here to avoid circular imports
from core.tasks import update_github_repositories
# Delay the task to process this specific GitHub repository
update_github_repositories.delay(
guru_type_slug=self.guru_type.slug,
repo_url=self.url
)
else:
self.save()
self.delete_from_milvus()


class FeaturedDataSource(models.Model):
Expand Down
7 changes: 5 additions & 2 deletions src/gurubase-backend/backend/core/serializers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from rest_framework import serializers
from core.models import WidgetId, Binge, DataSource, GuruType, Question, FeaturedDataSource, APIKey, Settings, CrawlState
from core.models import GithubFile, WidgetId, Binge, DataSource, GuruType, Question, FeaturedDataSource, APIKey, Settings, CrawlState
from core.gcp import replace_media_root_with_nginx_base_url
from django.conf import settings

Expand Down Expand Up @@ -73,7 +73,7 @@ class Meta:
class DataSourceSerializer(serializers.ModelSerializer):
class Meta:
model = DataSource
exclude = ['file', 'doc_ids', 'content', 'guru_type']
exclude = ['file', 'doc_ids', 'content', 'guru_type', 'last_reindex_date']

def to_representation(self, instance):
from core.utils import format_github_repo_error
Expand All @@ -88,6 +88,9 @@ def to_representation(self, instance):
if instance.type == DataSource.Type.GITHUB_REPO:
if instance.error:
repr['error'] = format_github_repo_error(instance.error, instance.user_error)

repr['file_count'] = GithubFile.objects.filter(data_source=instance, in_milvus=True).count()
repr['last_reindex_date'] = instance.last_successful_index_date

return repr

Expand Down
Loading