patterncsharpMinor
Guessing a file type based on its content
Viewed 0 times
filetypeitsbasedguessingcontent
Problem
In a project I'm working on, I need a component that is able to make a best guess about file's type based on its contents (a
I opened up Bing and Googled around a bit and found this gem on magic numbers. I figured it would probably do the job well enough so decided to roll a limited implementation in C#.
Most of the time it's a straightforward "starts with" check. However, some formats (I'm looking at you JPEG and PDF) make life slightly more difficult.
E.g. JPEG starts with
EDIT The second byte in a jpeg is actually 0xD8 - I obviously had a copy paste error here
PDFs can have their header anywhere within the first 1024 bytes which is why it's not always a "starts with" operation.
FileType.cs
FileTypeChecker.cs
```
public class FileTypeChecker
{
private static readonly IList kn
Stream). It doesn't need to do a particularly great job - basically I just need to guess a file extension (which will later be mapped to a MIME type). I can't trust any metadata - I can only act based on the content. I opened up Bing and Googled around a bit and found this gem on magic numbers. I figured it would probably do the job well enough, so I decided to roll a limited implementation in C#.
Most of the time it's a straightforward "starts with" check. However, some formats (I'm looking at you JPEG and PDF) make life slightly more difficult.
E.g. JPEG starts with
0xFF, 0xD, 0xFF, 0xE0, xx, xx, 0x4A, 0x46, 0x49, 0x46, 0x00 where xx can be anything (I think). EDIT: The second byte in a JPEG is actually 0xD8 - I obviously had a copy-paste error here.
PDFs can have their header anywhere within the first 1024 bytes which is why it's not always a "starts with" operation.
FileType.cs
public class FileType
{
// Sentinel instance: name "unknown", empty extension, empty magic sequence.
private static readonly FileType unknown = new FileType("unknown", string.Empty, new byte?[0]);

// Immutable state, assigned once in the constructor.
private readonly string name;
private readonly string extension;
private readonly byte?[] magicSequence;
private readonly int maximumStartIndex;

/// <summary>Gets the shared instance that represents an unrecognised file type.</summary>
public static FileType Unknown
{
    get { return unknown; }
}

/// <summary>Gets the display name supplied at construction.</summary>
public string Name
{
    get { return name; }
}

/// <summary>Gets the file extension supplied at construction.</summary>
public string Extension
{
    get { return extension; }
}

/// <summary>
/// Gets the magic byte sequence supplied at construction.
/// NOTE(review): null entries presumably act as "any byte" wildcards - confirm against the checker.
/// </summary>
public byte?[] MagicSequence
{
    get { return magicSequence; }
}

/// <summary>Gets the maximum start index supplied at construction (0 when omitted).</summary>
public int MaximumStartLocation
{
    get { return maximumStartIndex; }
}

/// <summary>Creates a file type description from the given values; nothing is validated or copied.</summary>
/// <param name="name">Display name of the file type.</param>
/// <param name="extension">File extension associated with the type.</param>
/// <param name="magicSequence">Magic byte sequence identifying the type.</param>
/// <param name="maximumStartIndex">Maximum start index for the sequence; defaults to 0.</param>
public FileType(string name, string extension, byte?[] magicSequence, int maximumStartIndex = 0)
{
    this.maximumStartIndex = maximumStartIndex;
    this.magicSequence = magicSequence;
    this.extension = extension;
    this.name = name;
}
}FileTypeChecker.cs
```
public class FileTypeChecker
{
private static readonly IList kn
Solution
The more I think about it,
Here's one possible solution. Create an abstract class (or interface)
Then we need a few classes that derive from this.
(I've been a bit fast and loose with this, there might be bugs present.)
With all that in place, we can rewrite
And finally
So what has all this done for us? For one, I think the code is more composable and amenable to change. But in practical terms, I set up
StartOfFileContainsFileType is doing too many things: exact matching, fuzzy matching, and matching within a range. Here's one possible solution. Create an abstract class (or interface)
FileTypeMatcher. I went with an abstract class just to keep all the validation in one place.public abstract class FileTypeMatcher
{
/// <summary>
/// Validates <paramref name="stream"/>, optionally rewinds it to the start, and then
/// delegates the actual comparison to <see cref="MatchesPrivate"/>.
/// </summary>
/// <param name="stream">The content to inspect.</param>
/// <param name="resetPosition">
/// When true (the default) a stream whose position is not 0 is rewound before matching.
/// </param>
/// <returns>Whatever <see cref="MatchesPrivate"/> returns for the stream.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The stream is not readable, or it is positioned away from 0 and cannot seek.
/// </exception>
public bool Matches(Stream stream, bool resetPosition = true)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    if (!stream.CanRead)
    {
        throw new ArgumentException("File contents must be a readable stream", "stream");
    }

    // NOTE(review): Position is read before CanSeek is consulted; many non-seekable
    // streams throw NotSupportedException from Position - confirm this is intended.
    var awayFromStart = stream.Position != 0;
    if (awayFromStart && !stream.CanSeek)
    {
        throw new ArgumentException("File contents must be a readable stream", "stream");
    }

    if (awayFromStart && resetPosition)
    {
        stream.Position = 0;
    }

    return MatchesPrivate(stream);
}

/// <summary>Performs the type-specific comparison against an already validated stream.</summary>
protected abstract bool MatchesPrivate(Stream stream);
}Then we need a few classes that derive from this.
public class ExactFileTypeMatcher : FileTypeMatcher
{
// The exact byte signature the stream must begin with (from its current position).
private readonly byte[] bytes;

/// <summary>Creates a matcher for an exact byte signature.</summary>
/// <param name="bytes">The signature; it is copied into a private array.</param>
public ExactFileTypeMatcher(IEnumerable<byte> bytes)
{
    // The element type must be byte so that ToArray() yields the byte[] the field expects.
    this.bytes = bytes.ToArray();
}

/// <summary>
/// Reads one byte from the stream per signature byte and compares them in order.
/// </summary>
/// <param name="stream">The stream to read from, starting at its current position.</param>
/// <returns>True when every signature byte was matched; false at the first mismatch.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    foreach (var b in this.bytes)
    {
        // ReadByte yields -1 at end of stream, which never equals a byte value,
        // so a stream shorter than the signature fails here too.
        if (stream.ReadByte() != b)
        {
            return false;
        }
    }

    return true;
}
}public class FuzzyFileTypeMatcher : FileTypeMatcher
{
// The signature to match; a null entry accepts any byte at that offset.
private readonly byte?[] bytes;

/// <summary>Creates a matcher for a signature that may contain wildcard (null) positions.</summary>
/// <param name="bytes">The signature; it is copied into a private array.</param>
public FuzzyFileTypeMatcher(IEnumerable<byte?> bytes)
{
    // The element type must be byte? so that ToArray() yields the byte?[] the field expects.
    this.bytes = bytes.ToArray();
}

/// <summary>
/// Reads one byte from the stream per signature entry; entries with a value must equal
/// the byte read, entries without a value accept anything except end of stream.
/// </summary>
/// <param name="stream">The stream to read from, starting at its current position.</param>
/// <returns>True when the whole signature was satisfied; otherwise false.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    foreach (var b in this.bytes)
    {
        var c = stream.ReadByte();

        // -1 means the stream ended before the signature did; even a wildcard needs a byte.
        if (c == -1 || (b.HasValue && c != b.Value))
        {
            return false;
        }
    }

    return true;
}
}public class RangeFileTypeMatcher : FileTypeMatcher
{
// The matcher that is retried at successive offsets.
private readonly FileTypeMatcher matcher;

// The last (inclusive) offset at which the inner matcher is tried.
private readonly int maximumStartLocation;

/// <summary>Creates a matcher that tries <paramref name="matcher"/> at offsets 0..maximumStartLocation.</summary>
/// <param name="matcher">The matcher to apply at each candidate offset.</param>
/// <param name="maximumStartLocation">The highest offset, inclusive, at which a match may start.</param>
public RangeFileTypeMatcher(FileTypeMatcher matcher, int maximumStartLocation)
{
    this.matcher = matcher;
    this.maximumStartLocation = maximumStartLocation;
}

/// <summary>
/// Positions the stream at each candidate offset in turn and asks the inner matcher
/// whether the signature starts there.
/// </summary>
/// <param name="stream">A seekable stream; its position is moved by this method.</param>
/// <returns>True as soon as the inner matcher succeeds at some offset; otherwise false.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    // Offsets beyond the end of the stream cannot produce new results, so stop there
    // instead of probing all the way to maximumStartLocation.
    long lastStart = this.maximumStartLocation;
    if (stream.Length < lastStart)
    {
        lastStart = stream.Length;
    }

    // The bound is inclusive: a maximum start location of 0 still probes offset 0.
    for (long start = 0; start <= lastStart; start++)
    {
        stream.Position = start;

        // resetPosition must be false, otherwise the inner matcher would rewind to 0.
        if (this.matcher.Matches(stream, resetPosition: false))
        {
            return true;
        }
    }

    return false;
}
}(I've been a bit fast and loose with this, there might be bugs present.)
With all that in place, we can rewrite
FileType to delegate to FileTypeMatcherpublic class FileType
{
// Sentinel instance: name "unknown", empty extension, no matcher.
private static readonly FileType unknown = new FileType("unknown", string.Empty, null);

// Immutable state, assigned once in the constructor.
private readonly string name;
private readonly string extension;
private readonly FileTypeMatcher fileTypeMatcher;

/// <summary>Gets the shared instance that represents an unrecognised file type.</summary>
public static FileType Unknown
{
    get { return unknown; }
}

/// <summary>Gets the display name supplied at construction.</summary>
public string Name
{
    get { return name; }
}

/// <summary>Gets the file extension supplied at construction.</summary>
public string Extension
{
    get { return extension; }
}

/// <summary>Creates a file type whose detection is delegated to <paramref name="matcher"/>.</summary>
/// <param name="name">Display name of the file type.</param>
/// <param name="extension">File extension associated with the type.</param>
/// <param name="matcher">Matcher used by <see cref="Matches"/>; may be null.</param>
public FileType(string name, string extension, FileTypeMatcher matcher)
{
    this.fileTypeMatcher = matcher;
    this.extension = extension;
    this.name = name;
}

/// <summary>Reports whether <paramref name="stream"/> looks like this file type.</summary>
/// <param name="stream">The content to inspect.</param>
/// <returns>True when there is no matcher, otherwise the matcher's verdict.</returns>
public bool Matches(Stream stream)
{
    // A type without a matcher (such as Unknown) accepts every stream.
    if (this.fileTypeMatcher == null)
    {
        return true;
    }

    return this.fileTypeMatcher.Matches(stream);
}
}And finally
FileTypeChecker to delegate to FileTypepublic class FileTypeChecker
{
// Signatures are tried in this order; GetFileType reports the first one that matches.
private static readonly IList<FileType> knownFileTypes = new List<FileType>
{
    new FileType("Bitmap", ".bmp", new ExactFileTypeMatcher(new byte[] {0x42, 0x4d})),
    new FileType("Portable Network Graphic", ".png",
        new ExactFileTypeMatcher(new byte[] {0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A, 0x1A, 0x0A})),
    // JPEG/JFIF: FF D8 (start of image), FF E0 (APP0), two length bytes (any value), "JFIF\0".
    // The second byte is 0xD8, not 0xD.
    new FileType("JPEG", ".jpg",
        new FuzzyFileTypeMatcher(new byte?[] {0xFF, 0xD8, 0xFF, 0xE0, null, null, 0x4A, 0x46, 0x49, 0x46, 0x00})),
    new FileType("Graphics Interchange Format 87a", ".gif",
        new ExactFileTypeMatcher(new byte[] {0x47, 0x49, 0x46, 0x38, 0x37, 0x61})),
    new FileType("Graphics Interchange Format 89a", ".gif",
        new ExactFileTypeMatcher(new byte[] {0x47, 0x49, 0x46, 0x38, 0x39, 0x61})),
    // "%PDF" may start anywhere near the beginning of the file, hence the range matcher.
    new FileType("Portable Document Format", ".pdf", new RangeFileTypeMatcher(new ExactFileTypeMatcher(new byte[] { 0x25, 0x50, 0x44, 0x46 }), 1019))
    // ... Potentially more in future
};

/// <summary>Returns the first known file type that matches the content.</summary>
/// <param name="fileContent">The content to inspect.</param>
/// <returns>The first matching type, or <see cref="FileType.Unknown"/> when none match.</returns>
public FileType GetFileType(Stream fileContent)
{
    return GetFileTypes(fileContent).FirstOrDefault() ?? FileType.Unknown;
}

/// <summary>Returns every known file type that matches the content.</summary>
/// <param name="stream">The content to inspect.</param>
/// <returns>
/// A lazily evaluated sequence; the stream is only read while the sequence is enumerated.
/// </returns>
public IEnumerable<FileType> GetFileTypes(Stream stream)
{
    return knownFileTypes.Where(fileType => fileType.Matches(stream));
}
}So what has all this done for us? For one, I think the code is more composable and amenable to change. But in practical terms, I set up
Code Snippets
public abstract class FileTypeMatcher
{
/// <summary>
/// Checks that <paramref name="stream"/> is usable, rewinds it when requested, and hands
/// it to <see cref="MatchesPrivate"/> for the actual comparison.
/// </summary>
/// <param name="stream">The content to inspect.</param>
/// <param name="resetPosition">
/// When true (the default) a stream whose position is not 0 is moved back to 0 first.
/// </param>
/// <returns>The result of <see cref="MatchesPrivate"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
/// <exception cref="ArgumentException">
/// The stream cannot be read, or it is not at position 0 and cannot seek.
/// </exception>
public bool Matches(Stream stream, bool resetPosition = true)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    if (!stream.CanRead)
    {
        throw new ArgumentException("File contents must be a readable stream", "stream");
    }

    // NOTE(review): Position is queried before CanSeek; non-seekable streams commonly
    // throw NotSupportedException from Position - confirm this ordering is intended.
    var notAtStart = stream.Position != 0;
    if (notAtStart && !stream.CanSeek)
    {
        throw new ArgumentException("File contents must be a readable stream", "stream");
    }

    if (notAtStart && resetPosition)
    {
        stream.Position = 0;
    }

    return MatchesPrivate(stream);
}

/// <summary>Type-specific comparison, invoked only after the stream passed validation.</summary>
protected abstract bool MatchesPrivate(Stream stream);
}public class ExactFileTypeMatcher : FileTypeMatcher
{
// The exact byte signature expected at the stream's current position.
private readonly byte[] bytes;

/// <summary>Creates a matcher for an exact byte signature.</summary>
/// <param name="bytes">The signature; it is copied into a private array.</param>
public ExactFileTypeMatcher(IEnumerable<byte> bytes)
{
    this.bytes = bytes.ToArray();
}

/// <summary>Compares the stream, byte by byte, against the signature.</summary>
/// <param name="stream">The stream to read from, starting at its current position.</param>
/// <returns>True when every signature byte was matched; false at the first mismatch.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    // All() stops at the first mismatch, so no further bytes are consumed after a failure.
    // ReadByte yields -1 at end of stream, which never equals a byte value.
    return this.bytes.All(expected => stream.ReadByte() == expected);
}
}public class FuzzyFileTypeMatcher : FileTypeMatcher
{
// The signature to match; a null entry accepts any byte at that offset.
private readonly byte?[] bytes;

/// <summary>Creates a matcher for a signature that may contain wildcard (null) positions.</summary>
/// <param name="bytes">The signature; it is copied into a private array.</param>
public FuzzyFileTypeMatcher(IEnumerable<byte?> bytes)
{
    this.bytes = bytes.ToArray();
}

/// <summary>
/// Walks the signature, reading one byte per entry: valued entries must equal the byte
/// read, null entries accept any byte but still require one to be present.
/// </summary>
/// <param name="stream">The stream to read from, starting at its current position.</param>
/// <returns>True when the whole signature was satisfied; otherwise false.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    for (var index = 0; index < this.bytes.Length; index++)
    {
        var expected = this.bytes[index];
        var actual = stream.ReadByte();

        // The stream ended before the signature did.
        if (actual == -1)
        {
            return false;
        }

        // Only entries that carry a value constrain the byte that was read.
        if (expected.HasValue && actual != expected.Value)
        {
            return false;
        }
    }

    return true;
}
}public class RangeFileTypeMatcher : FileTypeMatcher
{
// The matcher that is retried at successive offsets.
private readonly FileTypeMatcher matcher;

// The last (inclusive) offset at which the inner matcher is tried.
private readonly int maximumStartLocation;

/// <summary>Creates a matcher that tries <paramref name="matcher"/> at offsets 0..maximumStartLocation.</summary>
/// <param name="matcher">The matcher to apply at each candidate offset.</param>
/// <param name="maximumStartLocation">The highest offset, inclusive, at which a match may start.</param>
public RangeFileTypeMatcher(FileTypeMatcher matcher, int maximumStartLocation)
{
    this.matcher = matcher;
    this.maximumStartLocation = maximumStartLocation;
}

/// <summary>
/// Positions the stream at each candidate offset in turn and asks the inner matcher
/// whether the signature starts there.
/// </summary>
/// <param name="stream">A seekable stream; its position is moved by this method.</param>
/// <returns>True as soon as the inner matcher succeeds at some offset; otherwise false.</returns>
protected override bool MatchesPrivate(Stream stream)
{
    // Offsets beyond the end of the stream cannot produce new results, so the scan is
    // capped at the stream length rather than always running to maximumStartLocation.
    long lastStart = this.maximumStartLocation;
    if (stream.Length < lastStart)
    {
        lastStart = stream.Length;
    }

    // The bound is inclusive: a maximum start location of 0 still probes offset 0.
    for (long start = 0; start <= lastStart; start++)
    {
        stream.Position = start;

        // resetPosition must be false, otherwise the inner matcher would rewind to 0.
        if (this.matcher.Matches(stream, resetPosition: false))
        {
            return true;
        }
    }

    return false;
}
}public class FileType
{
// Sentinel instance: name "unknown", empty extension, no matcher.
private static readonly FileType unknown = new FileType("unknown", string.Empty, null);

// Immutable state, assigned once in the constructor.
private readonly string name;
private readonly string extension;
private readonly FileTypeMatcher fileTypeMatcher;

/// <summary>Gets the shared instance that represents an unrecognised file type.</summary>
public static FileType Unknown
{
    get { return unknown; }
}

/// <summary>Gets the display name supplied at construction.</summary>
public string Name
{
    get { return name; }
}

/// <summary>Gets the file extension supplied at construction.</summary>
public string Extension
{
    get { return extension; }
}

/// <summary>Creates a file type whose detection is delegated to <paramref name="matcher"/>.</summary>
/// <param name="name">Display name of the file type.</param>
/// <param name="extension">File extension associated with the type.</param>
/// <param name="matcher">Matcher used by <see cref="Matches"/>; may be null.</param>
public FileType(string name, string extension, FileTypeMatcher matcher)
{
    this.fileTypeMatcher = matcher;
    this.extension = extension;
    this.name = name;
}

/// <summary>Reports whether <paramref name="stream"/> looks like this file type.</summary>
/// <param name="stream">The content to inspect.</param>
/// <returns>True when there is no matcher, otherwise the matcher's verdict.</returns>
public bool Matches(Stream stream)
{
    // A type without a matcher (such as Unknown) accepts every stream.
    var matcher = this.fileTypeMatcher;
    return matcher != null ? matcher.Matches(stream) : true;
}
}Context
StackExchange Code Review Q#85054, answer score: 9
Revisions (0)
No revisions yet.