Conversation
b443113 to
0d6e4b3
Compare
0d6e4b3 to
1838c9e
Compare
|
|
||
| public static function create(): self | ||
| { | ||
| $raw_chars = file_get_contents(dirname(__FILE__) . "/../data/characters.json"); |
There was a problem hiding this comment.
Would it make sense to have the filepaths passed as parameters?
| public static function create(): self | ||
| { | ||
| $raw_chars = file_get_contents(dirname(__FILE__) . "/../data/characters.json"); | ||
| $byte_encoder = json_decode($raw_chars, true); |
There was a problem hiding this comment.
Might be convenient to have JSON_THROW_ON_ERROR as the fourth parameter
| "minimum-stability": "stable", | ||
| "require": {} | ||
| "require": { | ||
| "php": ">7.4" |
There was a problem hiding this comment.
Any particular reason to go with such an old PHP version? 7.4 is past its EOL. I would suggest just going with 8.0 here (security patches stopping in November of this year), if not 8.1 or 8.2
| }, | ||
| "minimum-stability": "stable", | ||
| "require": {} | ||
| "require": { |
| } | ||
| return -1; | ||
| } | ||
| use CodeRevolutionPlugins\GPT3Encoder\Encoder; |
There was a problem hiding this comment.
This whole file would serve better as an example in a README
|
|
||
| public function encode(string $text): array | ||
| { | ||
| $bpe_tokens = array(); |
There was a problem hiding this comment.
| $bpe_tokens = array(); | |
| $bpe_tokens = []; |
| preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches); | ||
| if(!isset($matches[0]) || count($matches[0]) == 0) | ||
| { | ||
| error_log('Failed to match string: ' . $text); |
There was a problem hiding this comment.
I think it makes sense to throw a custom RuntimeException here and let the caller determine how to handle it.
| return $bpe_tokens; | ||
| } | ||
|
|
||
| $cache = array(); |
There was a problem hiding this comment.
| $cache = array(); | |
| $cache = []; |
| $new_tokens = array(); | ||
| $chars = array(); |
There was a problem hiding this comment.
| $new_tokens = array(); | |
| $chars = array(); | |
| $new_tokens = $chars = []; |
|
|
||
| private function split($str, $len = 1) | ||
| { | ||
| $arr = []; |
I am not planning on working on this MR anymore. I am using https://github.com/Gioni06/GPT3Tokenizer now |
Thanks for the tip! 👍 |
The PHP alternative is discontinued, its author is using the project to which this patch points to, which is also available in the PHP package manager. See: CodeRevolutionPlugins/GPT-3-Encoder-PHP#1 (comment)
You should also add the package to https://packagist.org