From 51257089ea77c2eed540a89e4d5e00287e04b950 Mon Sep 17 00:00:00 2001 From: "K.Y" Date: Mon, 29 Apr 2024 01:02:59 +0800 Subject: [PATCH 1/4] Update render.yaml (#125) --- render.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/render.yaml b/render.yaml index b70c531..67cbca3 100644 --- a/render.yaml +++ b/render.yaml @@ -1,6 +1,6 @@ services: - name: openai-forward type: web - env: docker + runtime: docker region: oregon - plan: free \ No newline at end of file + plan: free From d4fe3b85cb9b33f19c33935fb6876a3fcbebd744 Mon Sep 17 00:00:00 2001 From: "K.Y" Date: Wed, 15 May 2024 01:19:30 +0800 Subject: [PATCH 2/4] feat: Support more flexible configuration options (#129) * feat: Support more flexible configuration options * Add gpt4o model * Feat: Now in general mode, model level control is also supported --- .env | 20 ++- .env.example | 6 +- .github/workflows/docs.yml | 49 ++++++ .gitignore | 1 - README.md | 41 +---- deploy.md | 4 + openai_forward/__init__.py | 17 +- openai_forward/__main__.py | 27 ++- openai_forward/cache/chat/chat_completions.py | 14 +- openai_forward/cache/chat/response.py | 6 +- openai_forward/config/__init__.py | 0 openai_forward/{webui => config}/interface.py | 36 +++- openai_forward/console.py | 2 + openai_forward/content/openai.py | 4 +- openai_forward/decorators.py | 76 +++++++++ openai_forward/forward/core.py | 158 ++++++++++-------- openai_forward/settings.py | 32 +++- openai_forward/webui/chat.py | 46 +++-- openai_forward/webui/run.py | 67 ++++---- 19 files changed, 425 insertions(+), 181 deletions(-) create mode 100644 .github/workflows/docs.yml create mode 100644 openai_forward/config/__init__.py rename openai_forward/{webui => config}/interface.py (82%) diff --git a/.env b/.env index dc6fa15..4627396 100644 --- a/.env +++ b/.env @@ -8,6 +8,8 @@ LOG_OPENAI=true CACHE_GENERAL=true CACHE_OPENAI=true +CHAT_COMPLETION_ROUTE=/v1/chat/completions +#CUSTOM_GENERAL_ROUTE=/v1/models/gemini-pro CACHE_ROUTES=["/v1/chat/completions","/v1/embeddings"] # `CACHE_BACKEND`: Options (MEMORY, LMDB, LevelDB) @@ -16,18 +18,21 @@ CACHE_BACKEND=MEMORY DEFAULT_REQUEST_CACHING_VALUE=false -#BENCHMARK_MODE=true +BENCHMARK_MODE=true FORWARD_CONFIG=[{"base_url":"https://api.openai.com","route":"/","type":"openai"}] #LEVEL_MODELS={"1": ["gpt-4"], "2": ["gpt-3.5-turbo"]} #OPENAI_API_KEY={"sk-xxx": [0], "sk-xxx": [1], "sk-xxx": [1,2]} -#FORWARD_KEY={"fk-0": 0, "fk-1": 1, "fk-2": 2} +#FORWARD_KEY={"fk-0": 0, "fk-1": 1, "fk-2": 2, "default": 1} # `REQ_RATE_LIMIT`: i.e., Request rate limit for specified routes, user specific # format: {route: ratelimit-string} # ratelimit-string format [count] [per|/] [n (optional)] [second|minute|hour|day|month|year] :ref:`ratelimit-string`: https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation -REQ_RATE_LIMIT={"/v1/chat/completions":"100/2minutes","/v1/completions":"60/minute;600/hour"} +REQ_RATE_LIMIT='{ +"/v1/chat/completions":[{"level":0,"limit":"100/2minutes"}], +"/v1/completions":[{"level":0,"limit":"60/minute;600/hour"}] +}' # Backend for rate limiting: [memory, redis, memcached, ...] 
:ref: https://limits.readthedocs.io/en/stable/storage.html# #REQ_RATE_LIMIT_BACKEND=redis://localhost:6379 @@ -40,7 +45,11 @@ GLOBAL_RATE_LIMIT=200/minute RATE_LIMIT_STRATEGY=moving-window # Rate limit for returned tokens -TOKEN_RATE_LIMIT={"/v1/chat/completions":"60/second","/v1/completions":"60/second"} +TOKEN_RATE_LIMIT='{ +"/v1/chat/completions":[{"level":0,"limit":"60/second"}], +"/v1/completions":[{"level":0,"limit":"60/second"}], +"/benchmark/v1/chat/completions":[{"level":0,"limit":"20/second"}] +}' # TCP connection timeout duration (in seconds) TIMEOUT=6 @@ -49,6 +58,9 @@ ITER_CHUNK_TYPE=one-by-one #ITER_CHUNK_TYPE=efficiency #IP_BLACKLIST= +WEBUI_RESTART_PORT=15555 +WEBUI_LOG_PORT=15556 +DEFAULT_STREAM_RESPONSE=true # Set timezone TZ=Asia/Shanghai diff --git a/.env.example b/.env.example index 6456b72..e5ac51a 100644 --- a/.env.example +++ b/.env.example @@ -6,7 +6,7 @@ LOG_OPENAI=true CACHE_GENERAL=true CACHE_OPENAI=true - +CHAT_COMPLETION_ROUTE=/v1/chat/completions # `CACHE_BACKEND`: Options (MEMORY, LMDB, LevelDB) CACHE_BACKEND=LMDB CACHE_ROOT_PATH_OR_URL=./FLAXKV_DB @@ -55,5 +55,9 @@ TIMEOUT=10 ITER_CHUNK_TYPE=efficiency #ITER_CHUNK_TYPE=one-by-one +WEBUI_RESTART_PORT=15555 +WEBUI_LOG_PORT=15556 + +DEFAULT_STREAM_RESPONSE=true # Set timezone TZ=Asia/Shanghai diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..417d0c4 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,49 @@ +name: website + +# build the documentation whenever there are new commits on main +on: + push: + branches: + - main + # Alternative: only build for tags. + # tags: + # - '*' + +# security: restrict permissions for CI jobs. +permissions: + contents: read + +jobs: + # Build the documentation and upload the static HTML files as an artifact. + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + + # ADJUST THIS: install all dependencies (including pdoc) + - run: pip install -e . + # ADJUST THIS: build your documentation into docs/. + # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here. + - run: python docs/make.py + + - uses: actions/upload-pages-artifact@v3 + with: + path: docs/ + + # Deploy the artifact to GitHub pages. + # This is a separate job so that only actions/deploy-pages has the necessary permissions. + deploy: + needs: build + runs-on: ubuntu-latest + permissions: + pages: write + id-token: write + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + steps: + - id: deployment + uses: actions/deploy-pages@v4 \ No newline at end of file diff --git a/.gitignore b/.gitignore index a90977c..62f1a65 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ .github/release-template.ejs .github/workflows/doc.yml -scripts/release.sh node_modules package-lock.json package.json diff --git a/README.md b/README.md index 5b5028c..0ae1127 100644 --- a/README.md +++ b/README.md @@ -163,37 +163,6 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ) ``` -
- 更多 - -#### 在三方应用中使用 - -基于开源项目[ChatGPT-Next-Web](https://github.com/Yidadaa/ChatGPT-Next-Web)中接入: -替换docker启动命令中的 `BASE_URL`为自己搭建的代理服务地址 - -```bash -docker run -d \ - -p 3000:3000 \ - -e OPENAI_API_KEY="sk-******" \ - -e BASE_URL="https://api.openai-forward.com" \ - -e CODE="******" \ - yidadaa/chatgpt-next-web -``` - -**Image Generation (DALL-E)** - -```bash -curl --location 'https://api.openai-forward.com/v1/images/generations' \ ---header 'Authorization: Bearer sk-******' \ ---header 'Content-Type: application/json' \ ---data '{ - "prompt": "A photo of a cat", - "n": 1, - "size": "512x512" -}' -``` - -
### 代理本地模型 @@ -207,7 +176,15 @@ curl --location 'https://api.openai-forward.com/v1/images/generations' \ (更多) -### 代理其它云端模型 +### 代理任意云端模型 + +#### 代理[gemini pro](https://ai.google.dev/) +配置环境变量或 .env 文件如下: +```env +FORWARD_CONFIG=[{"base_url":"https://generativelanguage.googleapis.com","route":"/gemini","type":"general"}] +``` +说明:`aidf run`启动后,即可通过访问 http://localhost:8000/gemini 使用gemini pro。 + - **场景1:** 使用通用转发,可对任意来源服务进行转发, diff --git a/deploy.md b/deploy.md index 7459e80..62f04cc 100644 --- a/deploy.md +++ b/deploy.md @@ -46,6 +46,10 @@ pip install openai-forward ```bash aifd run ``` +或运行webui +```bash +aifd run --webui +``` 服务就搭建完成了。 配置见[配置](README.md#配置) diff --git a/openai_forward/__init__.py b/openai_forward/__init__.py index a2258e1..039ea32 100644 --- a/openai_forward/__init__.py +++ b/openai_forward/__init__.py @@ -1,5 +1,20 @@ -__version__ = "0.7.2" +__version__ = "0.8.0-alpha" from dotenv import load_dotenv +from yaml import load + +def yaml_load(filepath): + + try: + from yaml import CLoader as Loader + except ImportError: + from yaml import Loader + with open(filepath, mode='r', encoding="utf-8") as stream: + # stream = stream.read() + content = load(stream, Loader=Loader) + return content + + +# yaml_load() load_dotenv('.env', override=False) diff --git a/openai_forward/__main__.py b/openai_forward/__main__.py index 24c0f6c..c3fe16d 100644 --- a/openai_forward/__main__.py +++ b/openai_forward/__main__.py @@ -66,15 +66,15 @@ def run(self, port=8000, workers=1, webui=False, start_ui=True, ui_port=8001): import zmq - mq_port = 15555 - os.environ['OPENAI_FORWARD_WEBUI'] = 'true' context = zmq.Context() socket = context.socket(zmq.REP) - socket.bind(f"tcp://*:{mq_port}") + restart_port = int(os.environ.get('WEBUI_RESTART_PORT', 15555)) + socket.bind(f"tcp://*:{restart_port}") log_socket = context.socket(zmq.ROUTER) - log_socket.bind(f"tcp://*:{15556}") + log_port = int(os.environ.get("WEBUI_LOG_PORT", 15556)) + log_socket.bind(f"tcp://*:{log_port}") subscriber_info = {} def mq_worker(log_socket: zmq.Socket): @@ -227,6 +227,25 @@ def convert(log_folder: str = None, target_path: str = None): convert_folder_to_jsonl(log_folder, target_path) print(60 * '-') + @staticmethod + def gen_config(dir: str = "."): + """ + Generates a .env file in the specified directory. 
+ """ + from pathlib import Path + + from openai_forward.config.interface import Config + + config = Config() + env_dict = config.convert_to_env(set_env=False) + dir = Path(dir) + + with open(dir / ".env", "w") as f: + env_content = "\n".join( + [f"{key}={value}" for key, value in env_dict.items()] + ) + f.write(env_content) + def main(): fire.Fire(Cli) diff --git a/openai_forward/cache/chat/chat_completions.py b/openai_forward/cache/chat/chat_completions.py index 3827e3c..e6c7511 100644 --- a/openai_forward/cache/chat/chat_completions.py +++ b/openai_forward/cache/chat/chat_completions.py @@ -9,9 +9,13 @@ from fastapi import Request from fastapi.responses import Response, StreamingResponse -from ...decorators import async_random_sleep, async_token_rate_limit +from ...decorators import ( + async_random_sleep, + async_token_rate_limit_auth_level, + random_sleep, +) from ...helper import get_unique_id -from ...settings import token_interval_conf +from ...settings import FWD_KEY, token_interval_conf from .tokenizer import TIKTOKEN_VALID, count_tokens, encode_as_pieces @@ -118,7 +122,7 @@ class ChatCompletionsResponse: sentences = cycle(corpus) -@async_token_rate_limit(token_interval_conf) +@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY) async def stream_generate( model: str, content: str | None, tool_calls: list | None, request: Request ): @@ -206,7 +210,7 @@ def serialize_delta( yield b'data: [DONE]\n\n' -@async_token_rate_limit(token_interval_conf) +@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY) async def stream_generate_efficient( model: str, content: str | None, tool_calls: list | None, request: Request ): @@ -292,6 +296,7 @@ def serialize_delta( yield b'data: [DONE]\n\n' +@random_sleep(min_time=1, max_time=2) def generate(model: str, content: str | None, tool_calls: list | None, usage: dict): created = int(time.time()) id = f"chatcmpl-{get_unique_id()}" @@ -336,7 +341,6 @@ def model_inference(model: str, messages: List): return ModelInferResult(content=sentence, usage=usage) -@async_random_sleep(min_time=0, max_time=1) async def chat_completions_benchmark(request: Request): payload = await request.json() model = payload.get("model", 'robot') diff --git a/openai_forward/cache/chat/response.py b/openai_forward/cache/chat/response.py index 46f759b..7b92c62 100644 --- a/openai_forward/cache/chat/response.py +++ b/openai_forward/cache/chat/response.py @@ -7,10 +7,10 @@ from flaxkv.pack import encode from loguru import logger -from ...settings import CACHE_OPENAI +from ...settings import CACHE_OPENAI, FWD_KEY from ..database import db_dict from .chat_completions import ( - async_token_rate_limit, + async_token_rate_limit_auth_level, generate, stream_generate_efficient, token_interval_conf, @@ -116,7 +116,7 @@ def get_cached_chat_response(payload_info, valid_payload, request, **kwargs): return None, cache_key -@async_token_rate_limit(token_interval_conf) +@async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY) async def stream_generate(buffer_list: List, request): for buffer in buffer_list: yield buffer diff --git a/openai_forward/config/__init__.py b/openai_forward/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/openai_forward/webui/interface.py b/openai_forward/config/interface.py similarity index 82% rename from openai_forward/webui/interface.py rename to openai_forward/config/interface.py index eb01781..32a122d 100644 --- a/openai_forward/webui/interface.py +++ b/openai_forward/config/interface.py @@ -4,7 +4,7 @@ from 
attrs import asdict, define, field, filters -from openai_forward.settings import * +from ..settings import * class Base: @@ -13,6 +13,9 @@ def to_dict(self, drop_none=True): return asdict(self, filter=filters.exclude(type(None))) return asdict(self) + def to_dict_str(self): + return {k: str(v) for k, v in self.to_dict(drop_none=True).items()} + @define(slots=True) class ForwardItem(Base): @@ -71,7 +74,7 @@ def convert_to_env(self, set_env=False): @define(slots=True) class RateLimitType(Base): route: str - value: str + value: List[Dict[str, str]] @define(slots=True) @@ -79,13 +82,25 @@ class RateLimit(Base): backend: str = '' global_rate_limit: str = 'inf' token_rate_limit: List[RateLimitType] = [ - RateLimitType(route="/v1/chat/completions", value="60/second"), - RateLimitType(route="/v1/completions", value="60/second"), + RateLimitType( + route="/v1/chat/completions", + value=[{"level": '0', "rate_limit": "60/second"}], + ), + RateLimitType( + route="/v1/completions", value=[{"level": '0', "rate_limit": "60/second"}] + ), ] req_rate_limit: List[RateLimitType] = [ - RateLimitType(route="/v1/chat/completions", value="100/2minutes"), - RateLimitType(route="/v1/completions", value="60/minute"), - RateLimitType(route="/v1/embeddings", value="100/2minutes"), + RateLimitType( + route="/v1/chat/completions", + value=[{"level": '0', "rate_limit": "100/2minutes"}], + ), + RateLimitType( + route="/v1/completions", value=[{"level": '0', "rate_limit": "60/minute"}] + ), + RateLimitType( + route="/v1/embeddings", value=[{"level": '0', "rate_limit": "100/2minutes"}] + ), ] iter_chunk: Literal['one-by-one', 'efficiency'] = 'one-by-one' strategy: Literal[ @@ -159,6 +174,9 @@ class Config(Base): timeout: int = 6 benchmark_mode: bool = False proxy: str = '' + webui_restart_port: int = 15555 + webui_log_port: int = 15556 + default_stream_response: bool = True def convert_to_env(self, set_env=False): env_dict = {} @@ -171,6 +189,10 @@ def convert_to_env(self, set_env=False): env_dict['TZ'] = self.timezone env_dict['TIMEOUT'] = str(self.timeout) env_dict['BENCHMARK_MODE'] = str(self.benchmark_mode) + env_dict['WEBUI_RESTART_PORT'] = str(self.webui_restart_port) + env_dict['WEBUI_LOG_PORT'] = str(self.webui_log_port) + env_dict['DEFAULT_STREAM_RESPONSE'] = str(self.default_stream_response) + if self.proxy: env_dict['PROXY'] = self.proxy diff --git a/openai_forward/console.py b/openai_forward/console.py index 8071b71..e86dd2e 100644 --- a/openai_forward/console.py +++ b/openai_forward/console.py @@ -1,3 +1,5 @@ +from typing import List + from rich import print from rich.panel import Panel from rich.table import Table diff --git a/openai_forward/content/openai.py b/openai_forward/content/openai.py index ee27e59..4e05f40 100644 --- a/openai_forward/content/openai.py +++ b/openai_forward/content/openai.py @@ -29,7 +29,8 @@ def __init__(self, route_prefix: str, _suffix: str): context = zmq.Context() socket = context.socket(zmq.DEALER) - socket.connect("tcp://localhost:15556") + webui_log_port = int(os.environ.get("WEBUI_LOG_PORT", 15556)) + socket.connect(f"tcp://localhost:{webui_log_port}") self.q = SimpleQueue(maxsize=200) @@ -77,7 +78,6 @@ def __init__(self, route_prefix: str): def parse_payload(request: Request, raw_payload): uid = get_unique_id() payload = orjson.loads(raw_payload) - print(f"{payload=}") content = { "prompt": payload['prompt'], diff --git a/openai_forward/decorators.py b/openai_forward/decorators.py index 9775ff7..e019224 100644 --- a/openai_forward/decorators.py +++ 
b/openai_forward/decorators.py @@ -169,6 +169,59 @@ async def wrapper(*args, **kwargs): return decorator +def async_token_rate_limit_auth_level(token_rate_limit: dict, key_level_map: dict): + """ + A decorator for rate-limiting requests based on tokens. It limits the rate at which tokens can be consumed + for a particular route path. + + Args: + token_rate_limit (dict): A dictionary mapping route paths to their respective token intervals (in seconds). + + Yields: + value: The value from the wrapped asynchronous generator. + + Note: + The 'request' object should be passed either as a keyword argument or as a positional argument to the + decorated function. + """ + + def decorator(async_gen_func): + @wraps(async_gen_func) + async def wrapper(*args, **kwargs): + request: Request = kwargs.get('request') + if not request: + # Try to find the request argument by position + func_argspec = inspect.getfullargspec(async_gen_func) + request_index = func_argspec.args.index('request') + request = args[request_index] + + route_path = f"{request.scope.get('root_path')}{request.scope.get('path')}" + fk_or_sk = request.headers.get("Authorization", "default") + level = key_level_map.get(fk_or_sk, 0) + default_interval = 0 + token_level_dict = token_rate_limit.get( + route_path, {level: default_interval} + ) + token_interval = token_level_dict[level] + + async_gen = async_gen_func(*args, **kwargs) + + start_time = time.perf_counter() + async for value in async_gen: + if token_interval > 0: + current_time = time.perf_counter() + delta = current_time - start_time + delay = token_interval - delta + if delay > 0: + await asyncio.sleep(delay) + start_time = time.perf_counter() + yield value + + return wrapper + + return decorator + + def async_random_sleep(min_time=0, max_time=1): """ Decorator that adds a random sleep time between min_time and max_time. @@ -190,3 +243,26 @@ async def wrapper(*args, **kwargs): return wrapper return decorator + + +def random_sleep(min_time=0, max_time=1): + """ + Decorator that adds a random sleep time between min_time and max_time. + + Args: + min_time (float, optional): The minimum sleep time in seconds. + max_time (float, optional): The maximum sleep time in seconds. 
+ """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + if max_time == 0: + return func(*args, **kwargs) + sleep_time = random.uniform(min_time, max_time) + time.sleep(sleep_time) + return func(*args, **kwargs) + + return wrapper + + return decorator diff --git a/openai_forward/forward/core.py b/openai_forward/forward/core.py index a39b748..5edc2c6 100644 --- a/openai_forward/forward/core.py +++ b/openai_forward/forward/core.py @@ -25,8 +25,8 @@ EmbeddingLogger, WhisperLogger, ) -from ..decorators import async_retry, async_token_rate_limit -from ..helper import InfiniteSet +from ..decorators import async_retry, async_token_rate_limit_auth_level +from ..helper import InfiniteSet, get_client_ip, get_unique_id from ..settings import * # from beartype import beartype @@ -41,6 +41,21 @@ class GenericForward: validate_host = bool(IP_BLACKLIST or IP_WHITELIST) timeout = aiohttp.ClientTimeout(connect=TIMEOUT) + _fk_to_level = FWD_KEY + _sk_to_levels = OPENAI_API_KEY + _level_to_model_set = {level: set(models) for level, models in LEVEL_MODELS.items()} + + _zero_level_model_set = InfiniteSet() + + _level_to_sks = {} + for sk, levels in _sk_to_levels.items(): + for level in levels: + _level_to_sks[level] = _level_to_sks.get(level, []) + [sk] + + _level_to_sk = {} + for level, sks in _level_to_sks.items(): + _level_to_sk[level] = cycle(sks) + def __init__(self, base_url: str, route_prefix: str, proxy=None): """ Args: @@ -53,6 +68,51 @@ def __init__(self, base_url: str, route_prefix: str, proxy=None): self.ROUTE_PREFIX = route_prefix self.client: aiohttp.ClientSession | None = None + @classmethod + def fk_to_sk(cls, forward_key: str): + """ + Convert a forward key to a secret key. + + Args: + forward_key (str): The forward key to convert. + + Returns: + str: The corresponding secret key, if it exists. Otherwise, None. + """ + level = cls._fk_to_level.get(forward_key) + if level is not None: + sk = cls._level_to_sk.get(level) + if sk: + return next(sk), level + return None, level + + @classmethod + def handle_authorization(cls, client_config): + """ + Handle the authorization for the client. + + Args: + client_config (dict): The configuration for the client. + + Returns: + str: The authorization string. + set: The set of models can be accessed. + """ + auth, auth_prefix = client_config["auth"], "Bearer " + model_set = cls._zero_level_model_set + + if auth: + fk = auth[len(auth_prefix) :] + if fk in FWD_KEY: + sk, level = cls.fk_to_sk(fk) + assert level is not None + if level != 0: + model_set = cls._level_to_model_set[level] + if sk: + auth = auth_prefix + sk + client_config["headers"]["Authorization"] = auth + return auth, model_set + async def build_client(self): """ Asynchronously build the client for making requests. @@ -84,7 +144,7 @@ def validate_request_host(ip): ) @staticmethod - @async_token_rate_limit(token_interval_conf) + @async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY) async def aiter_bytes( r: aiohttp.ClientResponse, request: Request, @@ -202,8 +262,8 @@ def prepare_client(self, request: Request, return_origin_header=False) -> dict: dict: The configuration for the client. 
""" assert self.BASE_URL and self.ROUTE_PREFIX + ip = get_client_ip(request) if self.validate_host: - ip = get_client_ip(request) self.validate_request_host(ip) _url_path = f"{request.scope.get('root_path')}{request.scope.get('path')}" @@ -248,9 +308,31 @@ def prepare_client(self, request: Request, return_origin_header=False) -> dict: 'headers': headers, "method": request.method, 'url': url, + 'ip': ip, 'route_path': route_path, } + async def _handle_payload(self, request: Request, route_path: str, model_set): + + if not request.method == "POST": + return + + if route_path in ( + CHAT_COMPLETION_ROUTE, + COMPLETION_ROUTE, + EMBEDDING_ROUTE, + CUSTOM_GENERAL_ROUTE, + ): + payload = await request.json() + model = payload.get("model", None) + + if model is not None and model not in model_set: + logger.warning(f"[Auth Warning] model: {model} is not allowed") + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail=f"model: {model} is not allowed", + ) + async def reverse_proxy(self, request: Request): """ Asynchronously handle reverse proxying the incoming request. @@ -267,9 +349,11 @@ async def reverse_proxy(self, request: Request): if LOG_GENERAL: logger.debug(f"payload: {data}") client_config = self.prepare_client(request, return_origin_header=True) - route_path = client_config["route_path"] + _, model_set = self.handle_authorization(client_config) + payload = await self._handle_payload(request, route_path, model_set) + cached_response, cache_key = get_cached_generic_response( data, request, route_path ) @@ -293,21 +377,6 @@ class OpenaiForward(GenericForward): Inherits from the GenericForward class and adds specific functionality for the OpenAI API. """ - _fk_to_level = FWD_KEY - _sk_to_levels = OPENAI_API_KEY - _level_to_model_set = {level: set(models) for level, models in LEVEL_MODELS.items()} - - _zero_level_model_set = InfiniteSet() - - _level_to_sks = {} - for sk, levels in _sk_to_levels.items(): - for level in levels: - _level_to_sks[level] = _level_to_sks.get(level, []) + [sk] - - _level_to_sk = {} - for level, sks in _level_to_sks.items(): - _level_to_sk[level] = cycle(sks) - def __init__(self, base_url: str, route_prefix: str, proxy=None): """ Initialize the OpenaiForward class. @@ -324,24 +393,6 @@ def __init__(self, base_url: str, route_prefix: str, proxy=None): self.whisper_logger = WhisperLogger(self.ROUTE_PREFIX) self.embedding_logger = EmbeddingLogger(self.ROUTE_PREFIX) - @classmethod - def fk_to_sk(cls, forward_key: str): - """ - Convert a forward key to a secret key. - - Args: - forward_key (str): The forward key to convert. - - Returns: - str: The corresponding secret key, if it exists. Otherwise, None. 
- """ - level = cls._fk_to_level.get(forward_key) - if level is not None: - sk = cls._level_to_sk.get(level) - if sk: - return next(sk), level - return None, level - def _handle_result( self, buffer: bytearray, uid: str, route_path: str, request_method: str ): @@ -449,7 +500,6 @@ async def _handle_payload(self, request: Request, route_path: str, model_set): if valid: if payload_log_info["model"] not in model_set: - print(f"{model_set=}") logger.warning( f"[Auth Warning] model: {payload_log_info['model']} is not allowed" ) @@ -487,7 +537,7 @@ async def read_chunks(r: aiohttp.ClientResponse, queue): await queue.put(buffer) # add all information when the stream ends - @async_token_rate_limit(token_interval_conf) + @async_token_rate_limit_auth_level(token_interval_conf, FWD_KEY) async def aiter_bytes( self, r: aiohttp.ClientResponse, @@ -568,32 +618,6 @@ async def aiter_bytes( else: logger.warning(f'uid: {uid}\n' f'{r.status}') - def handle_authorization(self, client_config): - """ - Handle the authorization for the client. - - Args: - client_config (dict): The configuration for the client. - - Returns: - str: The authorization string. - set: The set of models can be accessed. - """ - auth, auth_prefix = client_config["auth"], "Bearer " - model_set = self._zero_level_model_set - - if auth: - fk = auth[len(auth_prefix) :] - if fk in FWD_KEY: - sk, level = self.fk_to_sk(fk) - assert level is not None - if level != 0: - model_set = self._level_to_model_set[level] - if sk: - auth = auth_prefix + sk - client_config["headers"]["Authorization"] = auth - return auth, model_set - async def reverse_proxy(self, request: Request): """ Asynchronously handles reverse proxying the incoming request. @@ -612,7 +636,7 @@ async def reverse_proxy(self, request: Request): request, route_path, model_set ) uid = payload_info["uid"] - stream = payload_info.get('stream', None) + stream = payload_info.get('stream', DEFAULT_STREAM_RESPONSE) cached_response, cache_key = get_cached_response( payload, diff --git a/openai_forward/settings.py b/openai_forward/settings.py index 3f5dc60..ed6c5c0 100644 --- a/openai_forward/settings.py +++ b/openai_forward/settings.py @@ -12,15 +12,20 @@ general_additional_start_info = {} TIMEOUT = float(os.environ.get("TIMEOUT", "").strip() or "10") +DEFAULT_STREAM_RESPONSE = ( + os.environ.get("DEFAULT_STREAM_RESPONSE", "True").strip().lower() == "true" +) ITER_CHUNK_TYPE = ( os.environ.get("ITER_CHUNK_TYPE", "").strip() or "efficiency" ) # Options: efficiency, one-by-one -CHAT_COMPLETION_ROUTE = "/v1/chat/completions" -COMPLETION_ROUTE = "/v1/completions" -EMBEDDING_ROUTE = "/v1/embeddings" - +CHAT_COMPLETION_ROUTE = ( + os.environ.get("CHAT_COMPLETION_ROUTE", "/v1/chat/completions").strip().lower() +) +COMPLETION_ROUTE = os.environ.get("COMPLETION_ROUTE", "/v1/completions").strip().lower() +EMBEDDING_ROUTE = os.environ.get("EMBEDDING_ROUTE", "/v1/embeddings").strip().lower() +CUSTOM_GENERAL_ROUTE = os.environ.get("CUSTOM_GENERAL_ROUTE", "").strip().lower() CACHE_ROUTE_SET = set(env2dict("CACHE_ROUTES", [])) @@ -133,14 +138,21 @@ def get_limiter_key(request: Request): limiter_prefix = f"{request.scope.get('root_path')}{request.scope.get('path')}" - key = f"{limiter_prefix}-{get_client_ip(request)}" + fk_or_sk = request.headers.get("Authorization", "default") + key = f"{limiter_prefix},{fk_or_sk}" return key def dynamic_request_rate_limit(key: str): + limite_prefix, fk_or_sk = key.split(',') + key_level = FWD_KEY.get(fk_or_sk, 0) for route in req_rate_limit_dict: if key.startswith(route): - 
return req_rate_limit_dict[route] + for level_dict in req_rate_limit_dict[route]: + if level_dict['level'] == key_level: + return level_dict['limit'] + + break return GLOBAL_RATE_LIMIT @@ -157,8 +169,12 @@ def cvt_token_rate_to_interval(token_rate_limit: str): token_rate_limit_conf = env2dict("TOKEN_RATE_LIMIT") token_interval_conf = {} -for route, rate_limit in token_rate_limit_conf.items(): - token_interval_conf[route] = cvt_token_rate_to_interval(rate_limit) +for route, rate_limit_list in token_rate_limit_conf.items(): + token_interval_conf.setdefault(route, {}) + for level_dict in rate_limit_list: + token_interval_conf[route][level_dict['level']] = cvt_token_rate_to_interval( + level_dict['limit'] + ) styles = itertools.cycle( ["#7CD9FF", "#BDADFF", "#9EFFE3", "#f1b8e4", "#F5A88E", "#BBCA89"] diff --git a/openai_forward/webui/chat.py b/openai_forward/webui/chat.py index 74353f7..e9d58ad 100644 --- a/openai_forward/webui/chat.py +++ b/openai_forward/webui/chat.py @@ -1,37 +1,49 @@ -from collections import deque +from collections import OrderedDict, deque from typing import Callable, Dict import streamlit as st -def render_chat_log_message(msg: Dict): +def render_chat_log_message(msg: Dict, markdown=True): msg = msg.copy() + render = st.markdown if markdown else st.text if msg.get("user_role"): - with st.chat_message(name="human"): - messages = msg.pop('messages') + messages = msg.pop('messages') + with st.chat_message(name="user", avatar='🧑'): for msg_item in messages: # https://github.com/streamlit/streamlit/issues/7978 - # st.write(f"`{msg_item['role']}`: {msg_item['content']}") - st.text(f"`{msg_item['role']}`: {msg_item['content']}") + render(f"`{msg_item['role']}`: {msg_item['content']}") st.write(msg) elif msg.get("assistant_role"): - with st.chat_message(name="ai"): + with st.chat_message(name="assistant", avatar='🤖'): ass_content = msg.pop('assistant', None) - st.write(ass_content) + render(ass_content) st.write(msg) else: print(f"{msg=}") class ChatData: - def __init__(self, max_len: int, callback: Callable): - self.data = deque(maxlen=max_len) - self.callback = callback + def __init__(self, max_len: int): + self.data = OrderedDict() + self.max_len = max_len - def add_message(self, message): - self.data.append(message) - self.callback(message) + def add_message(self, message, **callback_kwargs): + uid = message.pop('uid') + if message.get("user_role"): + self.data.setdefault(uid, {'user_role': message}) + else: + msg_item = self.data.get(uid, {}) + msg_item['assistant_role'] = message + if len(self.data) >= self.max_len: + self.data.popitem(last=False) + render_chat_log_message(message, **callback_kwargs) - def render_all_messages(self): - for message in self.data: - self.callback(message) + def render_all_messages(self, **callback_kwargs): + for uid, msg in self.data.items(): + message = msg.get("user_role") + if message: + render_chat_log_message(message, **callback_kwargs) + message = msg.get("assistant_role") + if message: + render_chat_log_message(message, **callback_kwargs) diff --git a/openai_forward/webui/run.py b/openai_forward/webui/run.py index e8d66b1..0dfd1e0 100644 --- a/openai_forward/webui/run.py +++ b/openai_forward/webui/run.py @@ -1,7 +1,6 @@ -import os +import ast import pickle import threading -import time import orjson import pandas as pd @@ -9,9 +8,9 @@ import zmq from flaxkv.helper import SimpleQueue -from openai_forward.webui.chat import ChatData, render_chat_log_message +from openai_forward.config.interface import * +from openai_forward.webui.chat 
import ChatData from openai_forward.webui.helper import mermaid -from openai_forward.webui.interface import * st.set_page_config( page_title="Openai Forward ", @@ -32,10 +31,12 @@ def get_global_vars(): socket = context.socket(zmq.REQ) # socket.setsockopt(zmq.CONNECT_TIMEOUT, 20000) # 20s openai_forward_host = os.environ.get("OPENAI_FORWARD_HOST", "localhost") - socket.connect(f"tcp://{openai_forward_host}:15555") + restart_port = int(os.environ.get('WEBUI_RESTART_PORT', 15555)) + socket.connect(f"tcp://{openai_forward_host}:{restart_port}") log_socket = context.socket(zmq.DEALER) - log_socket.connect(f"tcp://{openai_forward_host}:15556") + webui_log_port = int(os.environ.get("WEBUI_LOG_PORT", 15556)) + log_socket.connect(f"tcp://{openai_forward_host}:{webui_log_port}") log_socket.send_multipart([b"/subscribe", b"0"]) def worker(log_socket: zmq.Socket, q: SimpleQueue): @@ -47,7 +48,7 @@ def worker(log_socket: zmq.Socket, q: SimpleQueue): q = SimpleQueue(maxsize=100) threading.Thread(target=worker, args=(log_socket, q)).start() config = Config().come_from_env() - chat_data = ChatData(100, render_chat_log_message) + chat_data = ChatData(100) return { "socket": socket, "log_socket": log_socket, @@ -155,7 +156,6 @@ def display_forward_configuration(): def display_api_key_configuration(): - with st.expander("See explanation"): st.write( """ @@ -184,9 +184,12 @@ def display_api_key_configuration(): # check openai models: # from openai import OpenAI - # client = OpenAI(api_key=) + # from rich import print + # client = OpenAI(api_key="sk-") # openai_model_list = [i.id for i in client.models.list()] # openai_model_list.sort() + # print(openai_model_list) + # print(len(openai_model_list)) openai_model_list = [ 'babbage-002', @@ -194,6 +197,7 @@ def display_api_key_configuration(): 'dall-e-3', 'davinci-002', 'gpt-3.5-turbo', + 'gpt-3.5-turbo-0125', 'gpt-3.5-turbo-0301', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-1106', @@ -205,8 +209,13 @@ def display_api_key_configuration(): 'gpt-4-0125-preview', 'gpt-4-0613', 'gpt-4-1106-preview', + 'gpt-4-1106-vision-preview', + 'gpt-4-turbo', + 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo-preview', 'gpt-4-vision-preview', + 'gpt-4o', + 'gpt-4o-2024-05-13', 'text-embedding-3-large', 'text-embedding-3-small', 'text-embedding-ada-002', @@ -354,7 +363,7 @@ def display_rate_limit_configuration(): st.subheader("Token Rate Limit") token_rate_limit_df = pd.DataFrame( - [i.to_dict() for i in rate_limit.token_rate_limit] + [i.to_dict_str() for i in rate_limit.token_rate_limit] ) edited_token_rate_limit_df = st.data_editor( token_rate_limit_df, @@ -365,7 +374,7 @@ def display_rate_limit_configuration(): st.subheader("Request Rate Limit") req_rate_limit_df = pd.DataFrame( - [i.to_dict() for i in rate_limit.req_rate_limit] + [i.to_dict_str() for i in rate_limit.req_rate_limit] ) edited_req_rate_limit_df = st.data_editor( req_rate_limit_df, @@ -380,12 +389,12 @@ def display_rate_limit_configuration(): rate_limit.strategy = strategy rate_limit.token_rate_limit = [ - RateLimitType(row["route"], row["value"]) + RateLimitType(row["route"], ast.literal_eval(row["value"])) for _, row in edited_token_rate_limit_df.iterrows() ] rate_limit.req_rate_limit = [ - RateLimitType(row["route"], row["value"]) + RateLimitType(row["route"], ast.literal_eval(row["value"])) for _, row in edited_req_rate_limit_df.iterrows() ] @@ -432,23 +441,23 @@ def display_other_configuration(): elif selected_section == "Real-time Logs": st.write("### Real-time Logs") st.write("\n") + render_with_markdown = st.toggle("Using 
Markdown to render", value=True) + st.write("\n") - with st.container(): - - q = st.session_state['q'] - chat_data: ChatData = st.session_state['chat_data'] - chat_data.render_all_messages() - while True: - uid, msg = q.get() - uid: bytes - print(f"{uid=}") - item = orjson.loads(msg) - if uid.startswith(b"0"): - item['user_role'] = True - else: - item['assistant_role'] = True - - chat_data.add_message(item) + q = st.session_state['q'] + chat_data: ChatData = st.session_state['chat_data'] + chat_data.render_all_messages(markdown=render_with_markdown) + while True: + uid, msg = q.get() + uid: bytes + item = orjson.loads(msg) + item['uid'] = uid[1:].decode() + if uid.startswith(b"0"): + item['user_role'] = True + else: + item['assistant_role'] = True + + chat_data.add_message(item, markdown=render_with_markdown) elif selected_section == "Playground": st.write("## todo") From a8735c1813fcffac74ec2a71046c5d30af7f002b Mon Sep 17 00:00:00 2001 From: "K.Y" Date: Thu, 16 May 2024 00:44:48 +0800 Subject: [PATCH 3/4] chore: Minor updates --- .github/workflows/docs.yml | 49 ---------------------- Examples/tool_calls.py | 84 ++++++++++++++++++++++++++++++++++++++ README.md | 4 +- README_EN.md | 4 +- 4 files changed, 86 insertions(+), 55 deletions(-) delete mode 100644 .github/workflows/docs.yml create mode 100644 Examples/tool_calls.py diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index 417d0c4..0000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: website - -# build the documentation whenever there are new commits on main -on: - push: - branches: - - main - # Alternative: only build for tags. - # tags: - # - '*' - -# security: restrict permissions for CI jobs. -permissions: - contents: read - -jobs: - # Build the documentation and upload the static HTML files as an artifact. - build: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - # ADJUST THIS: install all dependencies (including pdoc) - - run: pip install -e . - # ADJUST THIS: build your documentation into docs/. - # We use a custom build script for pdoc itself, ideally you just run `pdoc -o docs/ ...` here. - - run: python docs/make.py - - - uses: actions/upload-pages-artifact@v3 - with: - path: docs/ - - # Deploy the artifact to GitHub pages. - # This is a separate job so that only actions/deploy-pages has the necessary permissions. 
- deploy: - needs: build - runs-on: ubuntu-latest - permissions: - pages: write - id-token: write - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - steps: - - id: deployment - uses: actions/deploy-pages@v4 \ No newline at end of file diff --git a/Examples/tool_calls.py b/Examples/tool_calls.py new file mode 100644 index 0000000..673c3af --- /dev/null +++ b/Examples/tool_calls.py @@ -0,0 +1,84 @@ +from openai import OpenAI +from rich import print +from sparrow import MeasureTime, yaml_load # pip install sparrow-python + +config = yaml_load("config.yaml", rel_path=True) +print(f"{config=}") + +client = OpenAI( + api_key=config['api_key'], + base_url=config['api_base'], +) +stream = True + +n = 1 + +# debug = True +debug = False + +caching = True + +max_tokens = None + +model = "gpt-3.5-turbo" +# model="gpt-4" + +mt = MeasureTime().start() +tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } +] +resp = client.chat.completions.create( + model=model, + messages=[{"role": "user", "content": "What's the weather like in Boston today?"}], + tools=tools, + tool_choice="auto", # auto is default, but we'll be explicit + stream=stream, + extra_body={"caching": caching}, +) + +if stream: + if debug: + for chunk in resp: + print(chunk) + else: + for idx, chunk in enumerate(resp): + chunk_message = chunk.choices[0].delta or "" + if idx == 0: + mt.show_interval("tcp time:") + function = chunk_message.tool_calls[0].function + name = function.name + print(f"{chunk_message.role}: \n{name}: ") + continue + + content = "" + tool_calls = chunk_message.tool_calls + if tool_calls: + function = tool_calls[0].function + if function: + content = function.arguments or "" + print(content, end="") + print() +else: + print(resp) + assistant_content = resp.choices[0].message.content + print(assistant_content) + print(resp.usage) + +mt.show_interval("tool_calls") diff --git a/README.md b/README.md index 0ae1127..68f722d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ **简体中文** | [**English**](https://github.com/KenyonY/openai-forward/blob/main/README_EN.md)

- 🌠 OpenAI Forward
+ OpenAI Forward

@@ -26,8 +26,6 @@
-[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/KenyonY/openai-forward)
-
 [特点](#主要特性) |
 [部署指南](deploy.md) |
 [使用指南](#使用指南) |
diff --git a/README_EN.md b/README_EN.md
index 3ae708c..11ca3be 100644
--- a/README_EN.md
+++ b/README_EN.md
@@ -1,7 +1,7 @@
 **English** | [**简体中文**](https://github.com/KenyonY/openai-forward/blob/main/README.md)

- 🌠 OpenAI Forward
+ OpenAI Forward

@@ -27,8 +27,6 @@
-[![Deploy to Render](https://render.com/images/deploy-to-render-button.svg)](https://render.com/deploy?repo=https://github.com/KenyonY/openai-forward)
-
 [Features](#Key-Features) |
 [Deployment Guide](deploy_en.md) |

From 10eceb676e65c18ab7bc63f0b4407141d6c0cd8c Mon Sep 17 00:00:00 2001
From: "K.Y"
Date: Thu, 16 May 2024 00:51:14 +0800
Subject: [PATCH 4/4] Version upgrade

---
 openai_forward/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openai_forward/__init__.py b/openai_forward/__init__.py
index 039ea32..a7ea4d0 100644
--- a/openai_forward/__init__.py
+++ b/openai_forward/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.8.0-alpha"
+__version__ = "0.8.0"
 from dotenv import load_dotenv
 from yaml import load
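
The level-based access control added in PATCH 2/4 works as follows: a client authenticates with a forward key (`FORWARD_KEY`), the key maps to a level, `handle_authorization` exchanges it for one of the real `OPENAI_API_KEY` entries registered for that level, and `_handle_payload` rejects any model outside that level's `LEVEL_MODELS` entry; the new `REQ_RATE_LIMIT`/`TOKEN_RATE_LIMIT` formats attach limits to levels as well. The sketch below is not part of the patch series — it is a minimal client-side illustration of that flow, assuming a local `aifd run` instance on the default port 8000 configured roughly like the sample `.env` in PATCH 2/4. The forward key `fk-1`, the level numbers, and the model lists are illustrative placeholders, not values fixed by the patches.

```python
# Client-side sketch only -- not part of this patch series.
# Assumed proxy configuration (placeholders):
#   FORWARD_CONFIG=[{"base_url":"https://api.openai.com","route":"/","type":"openai"}]
#   OPENAI_API_KEY={"sk-real": [1]}
#   FORWARD_KEY={"fk-1": 1}
#   LEVEL_MODELS={"1": ["gpt-3.5-turbo"]}
from openai import OpenAI  # openai>=1.0

client = OpenAI(
    api_key="fk-1",                       # forward key; the proxy exchanges it for a level-1 sk- key
    base_url="http://localhost:8000/v1",  # aifd run, default port, route prefix "/"
)

# Allowed: "gpt-3.5-turbo" is in LEVEL_MODELS["1"], so the request is forwarded upstream.
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
)
print(resp.choices[0].message.content)

# Rejected: a model outside the level-1 set never reaches the upstream API;
# _handle_payload answers with HTTP 403 ("model: ... is not allowed").
try:
    client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": "Hello"}],
    )
except Exception as exc:  # openai>=1.0 raises PermissionDeniedError for HTTP 403
    print(f"blocked as expected: {exc}")
```

Under this assumed configuration, a level-0 forward key sees the unrestricted `InfiniteSet` (no model filtering), which matches the `_zero_level_model_set` default in `forward/core.py`.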