计算Git文件blob SHA(处理表情符号等)

我正在使用JavaScript API将文件推送到GitHub。我希望避免耗尽API调用（朝向速率限制）如果GitHub上的文件未更改。点击树API以获取文件的SHA似乎不计入速率限制，所以我认为一个好的策略是（a）计算将使用JavaScript上传的文件的sha，以及（b）将其与GitHub上现有文件的sha进行比较。此策略适用于纯文本文件。但当涉及表情符号时，计算会导致不匹配。
我有一个简单的repo here。
我创建了一个contents/read-write（仅限此repo）personal-access-token，您可以使用它直接运行代码（例如从JavaScript控制台窗口）。
控制台输出的结果是一个简单的字母“a”文件，其sha为“2 e65 efe 2a 145 dda 7 ee 51 d1741299 f848 e5 bf 752 e”，这是GitHub和JavaScript报告的。在我的测试中，它也适用于包含换行符等的json文件。但是一旦涉及emoji，sha就不再匹配。
如果你有关于如何在涉及表情符号/特殊字符时本地计算sha的想法，我将不胜感激。谢谢！

var _repo = 'sospike';
var _owner = 'vsjc91';

var _lettera = 'a';
var _emoji = '🌻';

// I created a contents/read-write (this repo only) personal-access-token that you can use:
var _auth = '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';

// stackoverflow.com/questions/73419876/javascript-replace-all-emoji-in-a-string-to-unicode-and-back
// also read re "deprecated" but "ok for this use case"
// stackoverflow.com/questions/30631927/converting-to-base64-in-javascript-without-deprecated-escape-call
const encodeTextEmojis = function(plainText) {
  return unescape(encodeURIComponent(plainText));
}

const decodeBase64Emojis = function(b64) {
  return decodeURIComponent(escape(b64));
}

// stackoverflow.com/questions/73419876/javascript-replace-all-emoji-in-a-string-to-unicode-and-back
const toBase64 = function(str, adjustEmojis) {
  if (adjustEmojis == true) {
    str = encodeTextEmojis(str);
  }
  return btoa(str);
}

const fromBase64 = function(b64, adjustEmojis) {
  if (adjustEmojis == true) {
    b64 = decodeBase64Emojis(b64);
  }
  return atob(b64);
}

const buildHeaders = function() {
  return {
        Accept: 'application/vnd.github+json',
        Authorization: `Bearer ${_auth}`
      }
}

const tryGetFileResponse = async function(url, headers) {
  const request = { method: "GET", headers: headers };
  try {
    return await fetch(url, request);
  }
  catch {
    // it's ok if the file isn't there
    return null;
  }
}

const getExistingSha = async function(fileName) {
  const url = `https://api.github.com/repos/${_owner}/${_repo}/git/trees/main:?cacheBreaker=${Date.now()}`;
  const headers = buildHeaders();
  const response = await tryGetFileResponse(url, headers);
  if (!response) { return null; }
  const tree = await response.json();
  if (!tree || !tree.tree) { return null; }
  const files = tree.tree;
  
  const file = files.find(function(f) {
    return f.path && f.path.toLowerCase() == fileName.toLowerCase();
  });
  
  if (!file) { return null; }
  console.log('github reported length for ' + fileName + ' of ' + file.size);
  return file.sha;
}

const upload = async function(path, content) {
  const commitMsg = 'testing';
  const encodedContent = toBase64(content, true);

  const existingSha = await getExistingSha(path, content);
  console.log('existing sha: ' + existingSha);

  const putFileBody = { message: commitMsg, content: encodedContent };
  if (existingSha) {
    putFileBody.sha = existingSha;
  }

  await (await fetch(
    `https://api.github.com/repos/${_owner}/${_repo}/contents/${path}`,
    {
      method: 'PUT',
      headers: buildHeaders(),
      body: JSON.stringify(putFileBody),
    }
  )).json();
}

/****************************************/
// calculate sha from javascript
/****************************************/

const utf8ByteLen = function(str) {
  if (!str || str.length == 0) { return 0; }
  const inputBytes = new TextEncoder().encode(str);
  return inputBytes.length;
}
  
// stackoverflow.com/a/40031979/9014097
const buf2hex = function (buffer) { // buffer is an ArrayBuffer
  return Array.prototype.map.call(new Uint8Array(buffer), x => ('00' + x.toString(16)).slice(-2)).join('');
}
  
// stackoverflow.com/questions/63736585/why-does-crypto-subtle-digest-return-an-empty-object
const calcSha1 = async function(str) {
  if (!str || str.length == 0) { return null; }
  const inputBytes = new TextEncoder().encode(str);
  const hashBytes = await window.crypto.subtle.digest('SHA-1', inputBytes);
  const hashedStr = buf2hex(hashBytes);
  return hashedStr;
}
  
// stackoverflow.com/questions/7225313/how-does-git-compute-file-hashes?rq=3
const calcGithubTextContentSha = async function(text) {
  const adjText = encodeTextEmojis(text);
  const len = utf8ByteLen(text);
  const data = `blob ${len}\0${adjText}`;
  const sha = calcSha1(data);
  return sha;
}

/****************************************/
// run the test
/****************************************/

console.log('A text file created manually with just "a" at github.com has:')
const sha_manual = await getExistingSha('plain_typed.txt');
console.log('that sha is: ' + sha_manual);

console.log('Upload a file containing just "a" to github via code and fetch back its sha:')
await upload('plain.txt', _lettera);
const sha = await getExistingSha('plain.txt');
console.log('pushed "a" file with sha: ' + sha);

console.log('Compare vs. calculate from javascript');
const sha_js = await calcGithubTextContentSha(_lettera);
console.log('js sha for "a" file: ' + sha_js);

console.log('Now try an emoji character');
await upload('emoji.txt', _emoji);
const sha_emoji = await getExistingSha('emoji.txt');
console.log('github emoji sha: ' + sha_emoji);

const sha_emoji_js = await calcGithubTextContentSha(_emoji);
console.log('js emoji sha: ' + sha_emoji_js);

字符串
（一个可能的线索？-我注意到，文件大小只包含字母“a”是1或2，这取决于它是使用GitHub.com手动编辑还是使用此代码推送）。
下面是日志输出：

A text file created manually with just "a" at github.com has:
github reported length for plain_typed.txt of 2
that sha is: 78981922613b2afb6025042ff6bd878ac1994e85

Upload a file containing just "a" to github via code and fetch back its sha:
github reported length for plain.txt of 1
existing sha: 2e65efe2a145dda7ee51d1741299f848e5bf752e
github reported length for plain.txt of 1
pushed "a" file with sha: 2e65efe2a145dda7ee51d1741299f848e5bf752e

Compare vs. calculate from javascript
js sha for "a" file: 2e65efe2a145dda7ee51d1741299f848e5bf752e

Now try an emoji character
github reported length for emoji.txt of 4
existing sha: 9ad8dd6d25e074eca9e19e06458bb9e7a314a875
github reported length for emoji.txt of 4
github emoji sha: 9ad8dd6d25e074eca9e19e06458bb9e7a314a875
js emoji sha: 553045c77ac300d4907c43bf0de77326ec610474

型

有几个明显的问题：
encodeTextEmojis和decodeBase64Emojis函数分别使用unescape和escape。不建议使用这些函数来处理像表情符号这样的多字节字符。此外，这些方法已被弃用。
由于toBase64和fromBase64函数调用了上述编码和解码函数，这可能会导致对表情符号的错误处理。
在calcGithubTextContentSha函数中，调用了encodeTextEmojis函数，这可能会改变文本，导致SHA-1哈希值与预期不同。
GitHub报告的手动创建的文件与以编程方式上传的文件的文件大小差异可能是编码差异的标志，尽管它不会直接反映在代码中。它也可能是在一个环境中添加的eol（行尾字符），而不是另一个环境。
使用TextEncoder更安全：
注意事项：这是一个文件的SHA1，而不是你在GitHub上需要的SHA1，它代表了提交的SHA1，并且包含了比文件内容更多的内容，但是：

提交的源树（它将分解为所有子树和blob）
父提交sha1
作者info
提交者信息（可以不同！）
提交信息

但是，如果我将emoji测试文本更改为'tests'，那么上传时似乎需要转义/取消转义（对于我使用的计算代码）。
在上传时没有转义/取消转义，我创建了一个带有“tests%20%F0%9F%8C%BB”的文件。我在“Converting to Base64 in JavaScript without Deprecated 'Escape' call“中得到了一些安慰。或者你对上传有更好的建议？
您在上传带有表情符号和空格的文本时遇到的编码问题是由于在此过程中的某个地方发生了URL编码。当您看到%20替换空格，而%F0%9F%8C%BB代表表情符号时，这是URL编码在起作用。
在JavaScript中，fetch API沿着和btoa function的Base64编码应该正确处理UTF-8字符，而不需要对它们进行转义/取消转义。但是，如果您发现URL编码正在发生并改变了您的文本，您可能需要在上传之前将其解码回原始形式。
您链接的堆栈溢出帖子建议使用unescape和encodeURIComponent来解决问题，但unescape是一个不推荐使用的函数。
同样，使用TextEncoder、TextDecoder和fetch应该更安全：

// Function to encode text to Base64, updated to handle potential overflows for large files.
const textToBase64 = function(text) {
    let encoder = new TextEncoder();
    let data = encoder.encode(text);
    let base64 = btoa([].reduce.call(new Uint8Array(data), function(p, c) {
        return p + String.fromCharCode(c);
    }, ''));
    return base64;
};

// Function to decode Base64 to text
const base64ToText = function(base64) {
  let binaryString = atob(base64);
  let bytes = new Uint8Array(binaryString.length);
  for (let i = 0; i < binaryString.length; i++) {
    bytes[i] = binaryString.charCodeAt(i);
  }
  let decoder = new TextDecoder();
  return decoder.decode(bytes);
};

// Your upload function
const upload = async function(path, content) {
  const commitMsg = 'testing';
  const encodedContent = textToBase64(content);  // using the new textToBase64 function

  const existingSha = await getExistingSha(path, content);
  console.log('existing sha: ' + existingSha);

  const putFileBody = { message: commitMsg, content: encodedContent };
  if (existingSha) {
    putFileBody.sha = existingSha;
  }

  await (await fetch(
    `https://api.github.com/repos/${_owner}/${_repo}/contents/${path}`,
    {
      method: 'PUT',
      headers: buildHeaders(),
      body: JSON.stringify(putFileBody),
    }
  )).json();
};

// Rest of your code

字符串
textToBase64函数已更新，以便在将文本转换为Base64时正确处理UTF-8编码。
正如the OP Vince在注解中以及laggingreflex的注解中所指出的，在更新的textToBase64函数中，btoa(String.fromCharCode(...new Uint8Array(data)))行已被btoa([].reduce.call(new Uint8Array(data), function(p, c) { return p + String.fromCharCode(c); }, ''))替换。该更改旨在通过以避免潜在溢出问题的方式构建字符串来处理更大的文件。
base64ToText函数用于处理Base64到文本的解码。
upload函数现在使用textToBase64将内容编码为Base64，这应该可以正确处理表情符号和其他特殊字符，而无需URL编码。
当你使用upload函数上传包含表情符号和空格的文本时，它应该不会再在GitHub上生成URL编码的文本。

计算Git文件blob SHA(处理表情符号等)

1条答案

相关问题

热门标签

最新问答