diff --git a/.gitignore b/.gitignore index e71a9f7..de49506 100644 --- a/.gitignore +++ b/.gitignore @@ -40,3 +40,5 @@ UnityFileSystemTestData/UserSettings/ UnityFileSystemTestData/Packages/ *.db *.csv + +*.stackdump diff --git a/Analyzer/AnalyzerTool.cs b/Analyzer/AnalyzerTool.cs index 4a8efe6..6bc6903 100644 --- a/Analyzer/AnalyzerTool.cs +++ b/Analyzer/AnalyzerTool.cs @@ -26,6 +26,7 @@ public int Analyze( string databaseName, string searchPattern, bool skipReferences, + bool skipCrc, bool verbose, bool noRecursion) { @@ -40,6 +41,7 @@ public int Analyze( { parser.Verbose = verbose; parser.SkipReferences = skipReferences; + parser.SkipCrc = skipCrc; parser.Init(writer.Connection); } diff --git a/Analyzer/PPtrAndCrcProcessor.cs b/Analyzer/PPtrAndCrcProcessor.cs index a9d5b13..ee3e1e2 100644 --- a/Analyzer/PPtrAndCrcProcessor.cs +++ b/Analyzer/PPtrAndCrcProcessor.cs @@ -7,31 +7,71 @@ namespace UnityDataTools.Analyzer; -// This class is used to extract all the PPtrs in a serialized object. It executes a callback whenever a PPtr is found. -// It provides a string representing the property path of the property (e.g. "m_MyObject.m_MyArray[2].m_PPtrProperty"). +// Walks the TypeTree of a serialized object to do two things in a single pass: +// 1. Extract every PPtr (object reference). A callback is executed for each one, receiving the +// property path that leads to it (e.g. "m_MyObject.m_MyArray[2].m_PPtrProperty"). +// 2. Accumulate a CRC32 over the object's serialized bytes, including the content of external +// streams (texture/mesh/audio data stored in companion .resS/.resource files). This CRC is a +// content fingerprint used to detect whether two objects are identical. +// NOTE: references contribute their resolved analyzer object id (see ExtractPPtr), so the CRC +// is only comparable within a single analyze database, not between separate runs - see issue #74. +// CRC computation can be disabled (skipCrc) while still extracting references. 
public class PPtrAndCrcProcessor : IDisposable { + // Invoked for each PPtr (object reference) found while walking an object. + // objectId - analyzer/database id of the object that contains the reference (the source) + // fileId - PPtr m_FileID: index into the file's external-reference table; 0 means this (local) file + // pathId - PPtr m_PathID: the referenced object's local file id (LFID) within that file + // propertyPath - dotted path to the reference, e.g. "m_MyObject.m_MyArray[2].m_PPtrProperty" + // propertyType - the referenced type, e.g. "Texture2D" + // Returns the analyzer/database id of the referenced object (same id space as objectId), which the + // caller folds into the CRC. public delegate int CallbackDelegate(long objectId, int fileId, long pathId, string propertyPath, string propertyType); - private SerializedFile m_SerializedFile; - private UnityFileReader m_Reader; - private long m_Offset; - private long m_ObjectId; - private uint m_Crc32; - private string m_Folder; - private StringBuilder m_StringBuilder = new(); - private byte[] m_pptrBytes = new byte[4]; - - private CallbackDelegate m_Callback; - + // Content-addressed stream paths (new ContentDirectory build output) look like + // "cah:/<hash>". The hash already identifies the content, so the path itself is + // folded into the CRC instead of opening the (differently named) resource file. + // Matched case-insensitively since the scheme casing is not guaranteed. + private const string ContentAddressedPrefix = "cah:/"; + + // Configuration shared across all objects, set once in the constructor. 
+ private SerializedFile m_SerializedFile; // file being analyzed; used to resolve referenced managed type trees + private UnityFileReader m_Reader; // reader over the serialized file holding the object data + private string m_Folder; // directory of the serialized file; used to find companion resource files + private bool m_SkipCrc; // when true, skip CRC computation (references are still extracted) + private CallbackDelegate m_Callback; // invoked for each PPtr; returns the referenced object's id + + // Readers for external resource (.resS/.resource) files, opened on demand, reused across + // objects, and disposed in Dispose(). private Dictionary m_resourceReaders = new(); - public PPtrAndCrcProcessor(SerializedFile serializedFile, UnityFileReader reader, string folder, + // Reusable scratch buffers, kept as fields to avoid allocating per object/property. + private StringBuilder m_StringBuilder = new(); // builds the current property path during the walk + private byte[] m_pptrBytes = new byte[4]; // holds a referenced object id while feeding it to the CRC + + // State for the object currently being processed, (re)initialized by each Process() call. + private long m_Offset; // current read position within m_Reader + private long m_ObjectId; // analyzer id of the object being processed, passed to the callback + private uint m_Crc32; // CRC accumulated so far for this object + + // serializedFile: the file whose objects are analyzed (used to resolve referenced managed types). + // reader: reader over that file's bytes; Process() walks each object through it. + // folder: directory containing the serialized file; companion .resS/.resource files are + // looked up here when a non-content-addressed external stream contributes to the CRC. + // skipCrc: when true, the tree is still walked to emit references but no CRC is computed. + // callback: called for every PPtr found; its return value (the referenced object's id) is + // folded into the CRC. 
+ public PPtrAndCrcProcessor( + SerializedFile serializedFile, + UnityFileReader reader, + string folder, + bool skipCrc, CallbackDelegate callback) { m_SerializedFile = serializedFile; m_Reader = reader; m_Folder = folder; + m_SkipCrc = skipCrc; m_Callback = callback; } @@ -45,40 +85,9 @@ public void Dispose() m_resourceReaders.Clear(); } - private UnityFileReader GetResourceReader(string filename) - { - var slashPos = filename.LastIndexOf('/'); - if (slashPos > 0) - { - filename = filename.Remove(0, slashPos + 1); - } - - if (!m_resourceReaders.TryGetValue(filename, out var reader)) - { - try - { - reader = new UnityFileReader("archive:/" + filename, 4 * 1024 * 1024); - } - catch (Exception) - { - try - { - reader = new UnityFileReader(Path.Join(m_Folder, filename), 4 * 1024 * 1024); - } - catch (Exception) - { - Console.Error.WriteLine(); - Console.Error.WriteLine($"Error opening resource file {filename}"); - reader = null; - } - } - - m_resourceReaders[filename] = reader; - } - - return reader; - } - + // Walks the serialized object rooted at `node`, whose data starts at `offset` in the reader, + // emitting every PPtr through the callback. Returns a CRC32 fingerprint of the object's content + // (0 when CRC is disabled). `objectId` is the analyzer id of this object, forwarded to the callback. public uint Process(long objectId, long offset, TypeTreeNode node) { m_Offset = offset; @@ -99,7 +108,7 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) { if (node.IsBasicType) { - m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Size, m_Crc32); + AppendCrc(m_Offset, node.Size); m_Offset += node.Size; } else if (node.IsArray) @@ -108,10 +117,12 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) } else if (node.Type == "vector" || node.Type == "map" || node.Type == "staticvector") { + // These containers wrap an Array node as their single child; process that array. 
ProcessArray(node.Children[0], false, isInManagedReferenceRegistry); } else if (node.Type.StartsWith("PPtr<")) { + // Extract T from the "PPtr<T>" type string. var startIndex = node.Type.IndexOf('<') + 1; var endIndex = node.Type.Length - 1; var referencedType = node.Type.Substring(startIndex, endIndex - startIndex); @@ -120,12 +131,15 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) } else if (node.Type == "StreamingInfo") { + // StreamingInfo (Texture2D/Mesh) points at external stream data: offset, size, path. if (node.Children.Count != 3) throw new Exception("Invalid StreamingInfo"); + // The offset field is 32- or 64-bit depending on the type tree version. var offset = node.Children[0].Size == 4 ? m_Reader.ReadInt32(m_Offset) : m_Reader.ReadInt64(m_Offset); m_Offset += node.Children[0].Size; + // size is an unsigned 32-bit field read as a signed int; streams >2GB are not handled. var size = m_Reader.ReadInt32(m_Offset); m_Offset += 4; @@ -136,16 +150,13 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) if (size > 0) { - var resourceFile = GetResourceReader(filename); - - if (resourceFile != null) - { - m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32); - } + AppendStreamCrc(offset, size, filename); } } else if (node.Type == "StreamedResource") { + // Like StreamingInfo but used by AudioClip/VideoClip; the fields are in a different + // order - path first, then 64-bit offset and size. if (node.Children.Count != 3) throw new Exception("Invalid StreamedResource"); @@ -157,28 +168,28 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) var offset = m_Reader.ReadInt64(m_Offset); m_Offset += 8; + // 64-bit size truncated to int; streams >2GB are not handled. 
var size = (int)m_Reader.ReadInt64(m_Offset); m_Offset += 8; if (size > 0) { - var resourceFile = GetResourceReader(filename); - - if (resourceFile != null) - { - m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32); - } + AppendStreamCrc(offset, size, filename); } } else if (node.CSharpType == typeof(string)) { + // A string is serialized as a 4-byte length followed by its bytes; CRC the whole span. var prevOffset = m_Offset; m_Offset += m_Reader.ReadInt32(m_Offset) + 4; - m_Crc32 = m_Reader.ComputeCRC(prevOffset, (int)(m_Offset - prevOffset), m_Crc32); + AppendCrc(prevOffset, (int)(m_Offset - prevOffset)); } else if (node.IsManagedReferenceRegistry) { - // ManagedReferenceRegistry are never nested + // The registry holds this object's [SerializeReference] instances (see + // ProcessManagedReferenceRegistry). It only appears at the top level of the object; + // the guard prevents re-entering it when we are already walking referenced-object + // data through another type tree (isInManagedReferenceRegistry == true). if (!isInManagedReferenceRegistry) ProcessManagedReferenceRegistry(node); } @@ -194,6 +205,8 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) } } + // Unity pads certain fields to a 4-byte boundary. Re-align after the node if it, or any of + // its children, is flagged to align. if ( ((int)node.MetaFlags & (int)TypeTreeMetaFlags.AlignBytes) != 0 || ((int)node.MetaFlags & (int)TypeTreeMetaFlags.AnyChildUsesAlignBytes) != 0 @@ -205,17 +218,20 @@ private void ProcessNode(TypeTreeNode node, bool isInManagedReferenceRegistry) private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry, bool isInManagedReferenceRegistry) { + // An Array node has two children: [0] is the int element count, [1] the element template. var dataNode = node.Children[1]; if (dataNode.IsBasicType) { + // Fixed-size elements are stored contiguously, so CRC the 4-byte count plus all element + // bytes in one range. 
(size * count can overflow int for very large arrays.) var arraySize = m_Reader.ReadInt32(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, dataNode.Size * arraySize + 4, m_Crc32); + AppendCrc(m_Offset, dataNode.Size * arraySize + 4); m_Offset += dataNode.Size * arraySize + 4; } else { - m_Crc32 = m_Reader.ComputeCRC(m_Offset, 4, m_Crc32); + AppendCrc(m_Offset, 4); var arraySize = m_Reader.ReadInt32(m_Offset); m_Offset += 4; @@ -234,20 +250,64 @@ private void ProcessArray(TypeTreeNode node, bool isManagedReferenceRegistry, bo } else { + // This is the version-2 "RefIds" array. Each element is a ReferencedObject + // whose children are [rid, type, data]; read the rid here and pass the type + // node to ProcessManagedReferenceData (the data node isn't needed - the layout + // comes from the referenced type's own TypeTree). if (dataNode.Children.Count < 3) throw new Exception("Invalid ReferencedObject"); - // First child is rid. long rid = m_Reader.ReadInt64(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, 8, m_Crc32); + AppendCrc(m_Offset, 8); m_Offset += 8; - ProcessManagedReferenceData(dataNode.Children[1], dataNode.Children[2], rid); + ProcessManagedReferenceData(dataNode.Children[1], rid); } } } } + // A ManagedReferenceRegistry holds the [SerializeReference] instances owned by this object. + // In YAML/JSON it is the "references:" section that always appears at the end of a + // MonoBehaviour/ScriptableObject. Each instance is stored here exactly once; the fields that + // point at it (elsewhere in the object) only store its "rid", so shared instances and cycles + // collapse to the same rid. 
+ // + // Given this C# source: + // + // [Serializable] public class MyClass { public string m_string; } + // + // public class MyScriptableObject : ScriptableObject + // { + // [SerializeReference] public MyClass m_refA, m_refB, m_refC; // m_refC assigned m_refB + // } + // + // the serialized layout looks like this (YAML shown; the binary we walk has the same shape): + // + // m_refA: { rid: 4862042034409046192 } + // m_refB: { rid: 4862042034409046193 } + // m_refC: { rid: 4862042034409046193 } // shared instance -> same rid as m_refB + // references: + // version: 2 + // RefIds: + // - rid: 4862042034409046192 + // type: { class: MyClass, ns: , asm: MyAssembly } + // data: { m_string: foo } + // - rid: 4862042034409046193 + // type: { class: MyClass, ns: , asm: MyAssembly } + // data: { m_string: bar } + // + // The complication: TypeTrees cannot express polymorphism, so the layout of each "data" block + // is NOT described by this object's own TypeTree. Each RefId entry names its concrete type + // (class/namespace/assembly), and the "data" bytes follow a SEPARATE TypeTree obtained via + // SerializedFile.GetRefTypeTypeTreeRoot(...). Walking the registry therefore means jumping into + // a different TypeTree for every entry (see ProcessManagedReferenceData) - which is exactly why + // finding references inside the registry is so much more involved than for the rest of the object. + // + // Two on-disk versions exist: + // version 1 - entries stored back to back and terminated by a sentinel type (see + // ProcessManagedReferenceData); the rid is implied by position. + // version 2 - entries stored as a "RefIds" array, each element carrying its own rid. private void ProcessManagedReferenceRegistry(TypeTreeNode node) { if (node.Children.Count < 2) @@ -255,19 +315,19 @@ private void ProcessManagedReferenceRegistry(TypeTreeNode node) // First child is version number. 
var version = m_Reader.ReadInt32(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, node.Children[0].Size, m_Crc32); + AppendCrc(m_Offset, node.Children[0].Size); m_Offset += node.Children[0].Size; if (version == 1) { - // Second child is the ReferencedObject. + // Second child is the ReferencedObject; its first child describes the referenced type. var refObjNode = node.Children[1]; - // And its children are the referenced type and data nodes. var refTypeNode = refObjNode.Children[0]; - var refObjData = refObjNode.Children[1]; + // Read entries until ProcessManagedReferenceData hits the sentinel; here the rid is + // simply the entry's position. int i = 0; - while (ProcessManagedReferenceData(refTypeNode, refObjData, i++)) + while (ProcessManagedReferenceData(refTypeNode, i++)) { } } @@ -295,38 +355,48 @@ private void ProcessManagedReferenceRegistry(TypeTreeNode node) } } - bool ProcessManagedReferenceData(TypeTreeNode refTypeNode, TypeTreeNode referencedTypeDataNode, long rid) + // Reads one registry entry: the concrete type's fully-qualified name (class, namespace, + // assembly) followed by the object's data. The data is laid out according to that type's own + // TypeTree, which we look up by name and recurse into - so the data node from the registry's + // own TypeTree is not needed here; refTypeNode is used only to sanity-check the entry's shape. + // Returns false at the end of a version-1 registry - marked either by the "Terminus" sentinel + // type or by a null/unknown rid (-1 / -2) - and true otherwise. + bool ProcessManagedReferenceData(TypeTreeNode refTypeNode, long rid) { if (refTypeNode.Children.Count < 3) throw new Exception("Invalid ReferencedManagedType"); + // The type's fully-qualified name is stored as three consecutive strings: class, namespace, + // then assembly. Each is a length-prefixed string, padded to a 4-byte boundary. 
var stringSize = m_Reader.ReadInt32(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32); + AppendCrc(m_Offset, stringSize + 4); var className = m_Reader.ReadString(m_Offset + 4, stringSize); m_Offset += stringSize + 4; m_Offset = (m_Offset + 3) & ~(3); stringSize = m_Reader.ReadInt32(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32); + AppendCrc(m_Offset, stringSize + 4); var namespaceName = m_Reader.ReadString(m_Offset + 4, stringSize); m_Offset += stringSize + 4; m_Offset = (m_Offset + 3) & ~(3); stringSize = m_Reader.ReadInt32(m_Offset); - m_Crc32 = m_Reader.ComputeCRC(m_Offset, (int)(m_Offset + stringSize + 4), m_Crc32); + AppendCrc(m_Offset, stringSize + 4); var assemblyName = m_Reader.ReadString(m_Offset + 4, stringSize); m_Offset += stringSize + 4; m_Offset = (m_Offset + 3) & ~(3); + // Sentinel that terminates a version-1 registry, plus the null/unknown rids. if ((className == "Terminus" && namespaceName == "UnityEngine.DMAT" && assemblyName == "FAKE_ASM") || rid == -1 || rid == -2) { return false; } + // The data block follows the referenced type's own TypeTree, not this object's, so look it + // up by FQN and walk it (isInManagedReferenceRegistry = true so we don't re-enter the registry). var refTypeTypeTree = m_SerializedFile.GetRefTypeTypeTreeRoot(className, namespaceName, assemblyName); - // Process the ReferencedObject using its own TypeTree. 
var size = m_StringBuilder.Length; m_StringBuilder.Append("rid("); m_StringBuilder.Append(rid); @@ -347,11 +417,86 @@ private void ExtractPPtr(string referencedType) if (fileId != 0 || pathId != 0) { var refId = m_Callback(m_ObjectId, fileId, pathId, m_StringBuilder.ToString(), referencedType); - m_pptrBytes[0] = (byte)(refId >> 24); - m_pptrBytes[1] = (byte)(refId >> 16); - m_pptrBytes[2] = (byte)(refId >> 8); - m_pptrBytes[3] = (byte)(refId); - m_Crc32 = Crc32Algorithm.Append(m_Crc32, m_pptrBytes); + + // The CRC folds in the resolved analyzer object id rather than the raw PPtr + // (fileId/pathId). This normalizes references so duplicate objects in different bundles + // hash the same within a database, but it makes the CRC depend on per-run id assignment, + // so CRCs are not comparable between separate databases. See issue #74. + if (!m_SkipCrc) + { + m_pptrBytes[0] = (byte)(refId >> 24); + m_pptrBytes[1] = (byte)(refId >> 16); + m_pptrBytes[2] = (byte)(refId >> 8); + m_pptrBytes[3] = (byte)(refId); + m_Crc32 = Crc32Algorithm.Append(m_Crc32, m_pptrBytes); + } } } + + // Extends the CRC with a range of the main serialized file, unless CRC is disabled. + private void AppendCrc(long offset, int size) + { + if (!m_SkipCrc) + m_Crc32 = m_Reader.ComputeCRC(offset, size, m_Crc32); + } + + // Extends the CRC with the content of an external stream segment (StreamingInfo / + // StreamedResource), unless CRC is disabled. Content-addressed paths fold in the path + // string; other paths read the actual bytes from the companion resource file. + private void AppendStreamCrc(long offset, int size, string path) + { + if (m_SkipCrc) + return; + + // A cah:/ stream always references the entire resource file: the hash in the path + // is the hash of the whole file, so the path uniquely identifies the bytes and we + // fold it into the CRC rather than reading them. 
The offset/size fields only exist + // for backward compatibility with the older output format that packed multiple + // resources into one file; ContentDirectory builds never do this (offset is 0 and + // size is the full file), which is why ignoring offset/size here is correct. + if (path.StartsWith(ContentAddressedPrefix, StringComparison.OrdinalIgnoreCase)) + { + m_Crc32 = Crc32Algorithm.Append(m_Crc32, Encoding.UTF8.GetBytes(path)); + return; + } + + var resourceFile = GetResourceReader(path); + if (resourceFile != null) + m_Crc32 = resourceFile.ComputeCRC(offset, size, m_Crc32); + } + + private UnityFileReader GetResourceReader(string filename) + { + var slashPos = filename.LastIndexOf('/'); + if (slashPos > 0) + { + filename = filename.Remove(0, slashPos + 1); + } + + if (!m_resourceReaders.TryGetValue(filename, out var reader)) + { + try + { + reader = new UnityFileReader("archive:/" + filename, 4 * 1024 * 1024); + } + catch (Exception) + { + try + { + reader = new UnityFileReader(Path.Join(m_Folder, filename), 4 * 1024 * 1024); + } + catch (Exception) + { + Console.Error.WriteLine(); + Console.Error.WriteLine($"Error opening resource file {filename}"); + reader = null; + } + } + + m_resourceReaders[filename] = reader; + } + + return reader; + } + } diff --git a/Analyzer/SQLite/Handlers/ISQLiteHandler.cs b/Analyzer/SQLite/Handlers/ISQLiteHandler.cs index 147e15e..2026d56 100644 --- a/Analyzer/SQLite/Handlers/ISQLiteHandler.cs +++ b/Analyzer/SQLite/Handlers/ISQLiteHandler.cs @@ -29,4 +29,5 @@ public interface ISQLiteFileParser : IDisposable void Parse(string filename); public bool Verbose { get; set; } public bool SkipReferences { get; set; } + public bool SkipCrc { get; set; } } diff --git a/Analyzer/SQLite/Parsers/AddressablesBuildLayoutParser.cs b/Analyzer/SQLite/Parsers/AddressablesBuildLayoutParser.cs index 4ac13e6..4941dae 100644 --- a/Analyzer/SQLite/Parsers/AddressablesBuildLayoutParser.cs +++ 
b/Analyzer/SQLite/Parsers/AddressablesBuildLayoutParser.cs @@ -15,6 +15,7 @@ public class AddressablesBuildLayoutParser : ISQLiteFileParser public bool Verbose { get; set; } public bool SkipReferences { get; set; } + public bool SkipCrc { get; set; } public void Dispose() { diff --git a/Analyzer/SQLite/Parsers/SerializedFileParser.cs b/Analyzer/SQLite/Parsers/SerializedFileParser.cs index dcb0128..c2d232c 100644 --- a/Analyzer/SQLite/Parsers/SerializedFileParser.cs +++ b/Analyzer/SQLite/Parsers/SerializedFileParser.cs @@ -15,6 +15,7 @@ public class SerializedFileParser : ISQLiteFileParser public bool Verbose { get; set; } public bool SkipReferences { get; set; } + public bool SkipCrc { get; set; } public bool CanParse(string filename) { @@ -36,7 +37,7 @@ public void Dispose() public void Init(SqliteConnection db) { - m_Writer = new SerializedFileSQLiteWriter(db, SkipReferences); + m_Writer = new SerializedFileSQLiteWriter(db, SkipReferences, SkipCrc); } public void Parse(string filename) diff --git a/Analyzer/SQLite/Writers/SerializedFileSQLiteWriter.cs b/Analyzer/SQLite/Writers/SerializedFileSQLiteWriter.cs index f91bcd4..fe15ab1 100644 --- a/Analyzer/SQLite/Writers/SerializedFileSQLiteWriter.cs +++ b/Analyzer/SQLite/Writers/SerializedFileSQLiteWriter.cs @@ -19,6 +19,7 @@ public class SerializedFileSQLiteWriter : IDisposable private int m_NextAssetBundleId = 0; private bool m_SkipReferences; + private bool m_SkipCrc; private IdProvider m_SerializedFileIdProvider = new(); private ObjectIdProvider m_ObjectIdProvider = new(); @@ -54,11 +55,12 @@ public class SerializedFileSQLiteWriter : IDisposable private SqliteConnection m_Database; private SqliteCommand m_LastId = new SqliteCommand(); private SqliteTransaction m_CurrentTransaction = null; - public SerializedFileSQLiteWriter(SqliteConnection database, bool skipReferences) + public SerializedFileSQLiteWriter(SqliteConnection database, bool skipReferences, bool skipCrc) { m_Initialized = false; m_Database = database; 
m_SkipReferences = skipReferences; + m_SkipCrc = skipCrc; } public void Init() @@ -116,7 +118,7 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con { using var sf = UnityFileSystem.OpenSerializedFile(fullPath); using var reader = new UnityFileReader(fullPath, 64 * 1024 * 1024); - using var pptrReader = new PPtrAndCrcProcessor(sf, reader, containingFolder, AddReference); + using var pptrReader = new PPtrAndCrcProcessor(sf, reader, containingFolder, m_SkipCrc, AddReference); int serializedFileId = m_SerializedFileIdProvider.GetId(Path.GetFileName(fullPath).ToLower()); int sceneId = -1; @@ -228,7 +230,10 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con m_AddObjectCommand.SetValue("game_object", ""); } - if (!m_SkipReferences) + // The walk both extracts references and accumulates the CRC, so it is needed + // unless both are disabled. When CRC is on but references are off, the walk + // still resolves referenced object ids (AddReference skips the insert). + if (!m_SkipReferences || !m_SkipCrc) { crc32 = pptrReader.Process(currentObjectId, offset, root); } @@ -264,15 +269,23 @@ public void WriteSerializedFile(string relativePath, string fullPath, string con } } + // Callback from PPtrAndCrcProcessor for each reference discovered in the SerializedFile private int AddReference(long objectId, int fileId, long pathId, string propertyPath, string propertyType) { + // Always resolve the id so the CRC stays stable; only persist the row when references + // are being extracted. 
var referencedObjectId = m_ObjectIdProvider.GetId((m_LocalToDbFileId[fileId], pathId)); - m_AddReferenceCommand.SetTransaction(m_CurrentTransaction); - m_AddReferenceCommand.SetValue("object", objectId); - m_AddReferenceCommand.SetValue("referenced_object", referencedObjectId); - m_AddReferenceCommand.SetValue("property_path", propertyPath); - m_AddReferenceCommand.SetValue("property_type", propertyType); - m_AddReferenceCommand.ExecuteNonQuery(); + + if (!m_SkipReferences) + { + m_AddReferenceCommand.SetTransaction(m_CurrentTransaction); + m_AddReferenceCommand.SetValue("object", objectId); + m_AddReferenceCommand.SetValue("referenced_object", referencedObjectId); + m_AddReferenceCommand.SetValue("property_path", propertyPath); + m_AddReferenceCommand.SetValue("property_type", propertyType); + m_AddReferenceCommand.ExecuteNonQuery(); + } + return referencedObjectId; } diff --git a/Documentation/analyzer.md b/Documentation/analyzer.md index 2e95ed3..dc6056e 100644 --- a/Documentation/analyzer.md +++ b/Documentation/analyzer.md @@ -47,7 +47,7 @@ case, Unity will include the asset in all the AssetBundles with a reference to i view_potential_duplicates provides the number of instances and the total size of the potentially duplicated assets. It also lists all the AssetBundles where the asset was found. -If the skipReferences option is used, there will be a lot of false positives in that view. Otherwise, +If the `--skip-crc` option is used, there will be a lot of false positives in that view. Otherwise, it should be very accurate because CRCs are used to determine if objects are identical. 
## asset_view (AssetBundleProcessor) diff --git a/Documentation/command-analyze.md b/Documentation/command-analyze.md index 78692d8..b4a08f7 100644 --- a/Documentation/command-analyze.md +++ b/Documentation/command-analyze.md @@ -13,7 +13,8 @@ UnityDataTool analyze [options] | `` | Path to folder containing files to analyze | *(required)* | | `-o, --output-file ` | Output database filename | `database.db` | | `-p, --search-pattern ` | File search pattern (`*` and `?` supported) | `*` | -| `-s, --skip-references` | Skip CRC and reference extraction (faster, smaller DB) | `false` | +| `-s, --skip-references` | Do not extract references (smaller DB, no `refs` table). CRC is still computed. | `false` | +| `--skip-crc` | Skip the CRC32 checksum calculation (faster; `objects.crc32` will be 0) | `false` | | `-v, --verbose` | Show more information during analysis | `false` | | `--no-recurse` | Do not recurse into sub-directories | `false` | | `-d, --typetree-data ` | Load an external TypeTree data file before processing (Unity 6.5+) | — | @@ -30,9 +31,9 @@ Analyze only `.bundle` files and specify a custom database name: UnityDataTool analyze /path/to/asset/bundles -o my_database.db -p "*.bundle" ``` -Fast analysis (skip reference tracking): +Fastest analysis (skip both reference extraction and CRC): ```bash -UnityDataTool analyze /path/to/bundles -s +UnityDataTool analyze /path/to/bundles --skip-references --skip-crc ``` See also [Analyze Examples](../../Documentation/analyze-examples.md). @@ -121,23 +122,27 @@ See [Comparing Builds](../../Documentation/comparing-builds.md) for strategies t ### Slow Analyze times, large output database -Consider using the `--skip-references` argument. +Two independent flags reduce analyze time and database size: -A real life analyze of a big Addressables build shows how large a difference this can make: +* `--skip-crc` skips the CRC32 calculation. 
This is usually the largest time saver, because computing a CRC requires reading the full content of every object, including large texture, mesh and audio data in companion `.resS`/`.resource` files. +* `--skip-references` skips reference extraction, which is the largest contributor to database size (the `refs` table). The references are not needed for core asset inventory and size information. -* 208 seconds and producted a 500MB database (not specifying --skip-reference) -* 9 seconds and produced a 68 MB file (with --skip-reference) +For the fastest, smallest result, combine them. -The references are not needed for core asset inventory and size information. +A real-life analysis of a big Addressables build, skipping both references and CRC, shows how large a difference this can make: -Note: When specifying `--skip-reference` some functionality is lost: +* 208 seconds and produced a 500 MB database (default) +* 9 seconds and produced a 68 MB file (with `--skip-references --skip-crc`) + +When `--skip-references` is used, some functionality is lost: * the `find-refs` command will not work * `view_material_shader_refs` and `view_material_texture_refs` will be empty +* `script_object_view` will be empty * Queries that look at the relationship between objects will not work. For example the refs table is required to link between a `MonoBehaviour` and its `MonoScript`. -* The `objects.crc32` column will be NULL/0 for all objects. This means: - * No detection of identical objects by content hash (See [Comparing Builds](../../Documentation/comparing-builds.md)) - * The `view_potential_duplicates` view relies partially on CRC32 to distinguish true duplicates -Future work: The refs table contains a lot of repeated strings and could be made smaller and more efficient. It might also be prudent to control the CRC32 calculation using an independent flag. +When `--skip-crc` is used, the `objects.crc32` column will be 0 for all objects. 
This means: + +* No detection of identical objects by content hash (See [Comparing Builds](../../Documentation/comparing-builds.md)) +* The `view_potential_duplicates` view relies partially on CRC32 to distinguish true duplicates diff --git a/UnityDataTool/Program.cs b/UnityDataTool/Program.cs index 2d5d591..9a0ef04 100644 --- a/UnityDataTool/Program.cs +++ b/UnityDataTool/Program.cs @@ -1,5 +1,6 @@ using System; using System.CommandLine; +using System.CommandLine.Invocation; using System.IO; using System.Threading.Tasks; using UnityDataTools.Analyzer; @@ -41,7 +42,8 @@ static Command BuildAnalyzeCommand() { var pathArg = new Argument("path", "The path to the directory containing the files to analyze").ExistingOnly(); var oOpt = new Option(aliases: new[] { "--output-file", "-o" }, description: "Filename of the output database", getDefaultValue: () => "database.db"); - var sOpt = new Option(aliases: new[] { "--skip-references", "-s" }, description: "Skip CRC and do not extract references"); + var sOpt = new Option(aliases: new[] { "--skip-references", "-s" }, description: "Do not extract references (CRC is still computed unless --skip-crc is also given)"); + var scOpt = new Option(aliases: new[] { "--skip-crc" }, description: "Skip CRC checksum calculation"); var rOpt = new Option(aliases: new[] { "--extract-references", "-r" }) { IsHidden = true }; var pOpt = new Option(aliases: new[] { "--search-pattern", "-p" }, description: "File search pattern", getDefaultValue: () => "*"); var vOpt = new Option(aliases: new[] { "--verbose", "-v" }, description: "Verbose output"); @@ -53,6 +55,7 @@ static Command BuildAnalyzeCommand() pathArg, oOpt, sOpt, + scOpt, rOpt, pOpt, vOpt, @@ -61,14 +64,28 @@ static Command BuildAnalyzeCommand() }; analyzeCommand.AddAlias("analyse"); - analyzeCommand.SetHandler( - (DirectoryInfo di, string o, bool s, bool r, string p, bool v, bool noRecurse, FileInfo d) => + // Bound via InvocationContext because the option count exceeds the 
strongly-typed + // SetHandler overloads. + analyzeCommand.SetHandler((InvocationContext context) => + { + var d = context.ParseResult.GetValueForOption(dOpt); + var ttResult = LoadTypeTreeDataFile(d); + if (ttResult != 0) { - var ttResult = LoadTypeTreeDataFile(d); - if (ttResult != 0) return Task.FromResult(ttResult); - return Task.FromResult(HandleAnalyze(di, o, s, r, p, v, noRecurse)); - }, - pathArg, oOpt, sOpt, rOpt, pOpt, vOpt, recurseOpt, dOpt); + context.ExitCode = ttResult; + return; + } + + context.ExitCode = HandleAnalyze( + context.ParseResult.GetValueForArgument(pathArg), + context.ParseResult.GetValueForOption(oOpt), + context.ParseResult.GetValueForOption(sOpt), + context.ParseResult.GetValueForOption(scOpt), + context.ParseResult.GetValueForOption(rOpt), + context.ParseResult.GetValueForOption(pOpt), + context.ParseResult.GetValueForOption(vOpt), + context.ParseResult.GetValueForOption(recurseOpt)); + }); return analyzeCommand; } @@ -293,6 +310,7 @@ static int HandleAnalyze( DirectoryInfo path, string outputFile, bool skipReferences, + bool skipCrc, bool extractReferences, string searchPattern, bool verbose, @@ -305,7 +323,7 @@ static int HandleAnalyze( Console.WriteLine("WARNING: --extract-references, -r option is deprecated (references are now extracted by default)"); } - return analyzer.Analyze(path.FullName, outputFile, searchPattern, skipReferences, verbose, noRecurse); + return analyzer.Analyze(path.FullName, outputFile, searchPattern, skipReferences, skipCrc, verbose, noRecurse); } static int HandleFindReferences(FileInfo databasePath, string outputFile, long? 
objectId, string objectName, string objectType, bool findAll) diff --git a/UnityFileSystem.Tests/UnityFileSystemTests.cs b/UnityFileSystem.Tests/UnityFileSystemTests.cs index 47fd784..7001a91 100644 --- a/UnityFileSystem.Tests/UnityFileSystemTests.cs +++ b/UnityFileSystem.Tests/UnityFileSystemTests.cs @@ -244,6 +244,25 @@ public void ReadFile_InvalidHandle_ThrowsException() Assert.Throws(() => file.Read(10, new byte[10])); } + // Ranges that cross the internal buffer boundary (and a partial final chunk) must + // produce the same CRC as a single-buffer read. TextFile.txt is 21 bytes; an 8-byte + // buffer forces three chunks (8 + 8 + 5). + [TestCase(0, 21)] // whole file, partial final chunk + [TestCase(0, 16)] // exact multiple of the buffer size + [TestCase(3, 15)] // unaligned start, crosses two boundaries + [TestCase(0, 8)] // exactly one buffer + [TestCase(2, 5)] // entirely within one buffer + public void ComputeCRC_RangeCrossingBuffer_MatchesSingleBufferRead(long offset, int size) + { + var path = Path.Combine(Context.TestDataFolder, "TextFile.txt"); + + using var singleBufferReader = new UnityFileReader(path, 1024 * 1024); + var expected = singleBufferReader.ComputeCRC(offset, size); + + using var smallBufferReader = new UnityFileReader(path, 8); + Assert.AreEqual(expected, smallBufferReader.ComputeCRC(offset, size)); + } + [Test] public void OpenFile_ArchiveFileSystem_ReturnsFile() { diff --git a/UnityFileSystem/UnityFileReader.cs b/UnityFileSystem/UnityFileReader.cs index bf46145..73221be 100644 --- a/UnityFileSystem/UnityFileReader.cs +++ b/UnityFileSystem/UnityFileReader.cs @@ -117,16 +117,18 @@ public byte ReadUInt8(long fileOffset) return m_Buffer[offset]; } + // Computes the CRC32 over a contiguous range, reading the file in buffer-sized chunks. public uint ComputeCRC(long fileOffset, int size, uint crc32 = 0) { - var readSize = size > m_Buffer.Length ? 
m_Buffer.Length : size; - var readBytes = 0; + var remaining = size; - while (readBytes < size) + while (remaining > 0) { - var offset = GetBufferOffset(fileOffset, readSize); - crc32 = Crc32Algorithm.Append(crc32, m_Buffer, offset, readSize); - readBytes += readSize; + var chunk = (int)Math.Min(m_Buffer.Length, remaining); + var offset = GetBufferOffset(fileOffset, chunk); + crc32 = Crc32Algorithm.Append(crc32, m_Buffer, offset, chunk); + fileOffset += chunk; + remaining -= chunk; } return crc32;