Update index.py #2

Open · wants to merge 1 commit into master
265 changes: 258 additions & 7 deletions api/index.py
@@ -1,13 +1,264 @@
from flask import Flask, jsonify, request
import requests
from bs4 import BeautifulSoup
import os
import re
import urllib.parse
import time
import random
import base64
from io import BytesIO
from urllib.parse import urlparse
import html2text

app = Flask(__name__)

def search_images(query, num_images=5):
    # Headers to mimic a browser request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Images URL
    url = f"https://www.google.com/search?q={formatted_query}&tbm=isch&safe=active"

    try:
        # Get the HTML content
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Find all image URLs using regex
        image_urls = re.findall(r'https?://[^"\']*?\.(?:jpg|jpeg|png|gif)', response.text)

        # Remove duplicates while preserving order
        image_urls = list(dict.fromkeys(image_urls))

        # Store results
        results = []
        downloaded = 0

        for img_url in image_urls:
            if downloaded >= num_images:
                break

            try:
                # Skip small thumbnails and icons
                if 'gstatic.com' in img_url or 'google.com' in img_url:
                    continue

                # Download image
                img_response = requests.get(img_url, headers=headers, timeout=10)
                img_response.raise_for_status()

                # Check if the response is actually an image
                content_type = img_response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    continue

                # Convert image to base64
                image_base64 = base64.b64encode(img_response.content).decode('utf-8')

                # Add to results
                results.append({
                    'image_url': img_url,
                    'base64_data': f"data:{content_type};base64,{image_base64}"
                })

                downloaded += 1

                # Add a random delay between downloads
                time.sleep(random.uniform(0.5, 1))

            except Exception as e:
                print(f"Error downloading image: {str(e)}")
                continue

        return results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []

@app.route('/search_images', methods=['GET'])
def api_search_images():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_images = int(request.args.get('num_images', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_images < 1 or num_images > 20:
            return jsonify({'error': 'Number of images must be between 1 and 20'}), 400

        # Search for images
        results = search_images(query, num_images)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

def get_domain(url):
    """Extract domain from URL"""
    parsed_uri = urlparse(url)
    return parsed_uri.netloc

def clean_text(text):
    """Clean scraped text"""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    # Remove special characters
    text = re.sub(r'[^\w\s.,!?-]', '', text)
    return text.strip()

def scrape_website(url, headers):
    """Scrape content from a single website"""
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'iframe']):
            element.decompose()

        # Convert HTML to text
        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True
        text = h.handle(str(soup))

        # Clean the text
        text = clean_text(text)

        # Get meta description
        meta_desc = ''
        meta_tag = soup.find('meta', attrs={'name': 'description'}) or soup.find('meta', attrs={'property': 'og:description'})
        if meta_tag:
            meta_desc = meta_tag.get('content', '')

        # Get title
        title = soup.title.string if soup.title else ''

        return {
            'title': clean_text(title),
            'meta_description': clean_text(meta_desc),
            'content': text[:1000],  # Limit content length
            'url': url
        }

    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None

def search_and_scrape(query, num_results=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'DNT': '1',
        'Connection': 'keep-alive',
    }

    # Format the query for URL
    formatted_query = urllib.parse.quote(query)

    # Google Search URL
    url = f"https://www.google.com/search?q={formatted_query}&num={num_results}"

    try:
        # Get Google search results
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all search result divs
        search_results = []
        result_divs = soup.find_all('div', class_='g')

        for div in result_divs:
            # Find the link
            link = div.find('a')
            if not link:
                continue

            href = link.get('href', '')

            # Skip if not a valid URL or if it's a Google-related URL
            if not href.startswith('http') or 'google.' in href:
                continue

            # Add random delay between requests
            time.sleep(random.uniform(1, 2))

            # Scrape the website
            site_data = scrape_website(href, headers)
            if site_data:
                search_results.append(site_data)

            if len(search_results) >= num_results:
                break

        return search_results

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return []

@app.route('/scrape_sites', methods=['GET'])
def api_scrape_sites():
    try:
        # Get query parameters
        query = request.args.get('query', '')
        num_results = int(request.args.get('num_results', 5))

        if not query:
            return jsonify({'error': 'Query parameter is required'}), 400

        if num_results < 1 or num_results > 10:
            return jsonify({'error': 'Number of results must be between 1 and 10'}), 400

        # Search and scrape sites
        results = search_and_scrape(query, num_results)

        return jsonify({
            'success': True,
            'query': query,
            'results': results
        })

    except Exception as e:
        return jsonify({
            'success': False,
            'error': str(e)
        }), 500

@app.route("/")
def home():
    return "Flask Vercel Example - Hello World", 200


@app.errorhandler(404)
def page_not_found(e):
    return jsonify({"status": 404, "message": "Not Found"}), 404


# Run the dev server last so the routes above are registered before app.run() blocks
if __name__ == "__main__":
    app.run(debug=True, port=5000)