conf.bib

@comment{{This file has been generated by bib2bib 1.96}}
@comment{{Command line: ./bib2bib -ob conf.bib -c '$type = "INPROCEEDINGS"' -s year -r mediamill.bib}}
@inproceedings{JainCVPR14,
  author = {Mihir Jain and Jan C. van Gemert and Herv\'e J\'egou and Patrick Bouthemy and Cees G. M. Snoek},
  title = {Action Localization by Tubelets from Motion},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  month = {June},
  year = {2014},
  pages = {},
  address = {Columbus, Ohio, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/jain-tubelets-cvpr2014.pdf},
  abstract = {
  
  		This paper considers the problem of action localization,
		where the objective is to determine when and where certain
		actions appear. We introduce a sampling strategy to produce
		2D+t sequences of bounding boxes, called tubelets.
		Compared to state-of-the-art alternatives, this drastically
		reduces the number of hypotheses that are likely to include
		the action of interest. Our method is inspired by a recent
		technique introduced in the context of image localization.
		Beyond considering this technique for the first time for
		videos, we revisit this strategy for 2D+t sequences obtained
		from super-voxels. Our sampling strategy advantageously
		exploits a criterion that reflects how action related motion
		deviates from background motion.
		We demonstrate the interest of our approach by extensive
		experiments on two public datasets: UCF Sports and MSR-II.
		Our approach significantly outperforms the state-of-the-art
		on both datasets, while restricting the search of actions
		to a fraction of possible bounding box sequences.
  		 
  		}
}
@inproceedings{MensinkCVPR14,
  author = {Thomas Mensink and Efstratios Gavves and Cees G. M. Snoek},
  title = {COSTA: Co-Occurrence Statistics for Zero-Shot Classification},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  month = {June},
  year = {2014},
  pages = {},
  address = {Columbus, Ohio, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/mensink-co-occurrence-cvpr2014.pdf},
  abstract = {
  		
  		In this paper we aim for zero-shot classification, that is 
  		visual recognition of an unseen class by using knowledge 
  		transfer from known classes. Our main contribution is COSTA, 
  		which exploits co-occurrences of visual concepts in images 
  		for knowledge transfer. These inter-dependencies arise 
  		naturally between concepts, and are easy to obtain from 
  		existing annotations or web-search hit counts. We estimate 
  		a classifier for a new label, as a weighted combination of 
  		related classes, using the co-occurrences to define the 
  		weight. We propose various metrics to leverage these 
  		co-occurrences, and a regression model for learning a weight 
  		for each related class. We also show that our zero-shot 
  		classifiers can serve as priors for few-shot learning. 
  		Experiments on three multi-labeled datasets reveal that our 
  		proposed zero-shot methods are approaching and occasionally 
  		outperforming fully supervised SVMs. We conclude that 
  		co-occurrence statistics suffice for zero-shot classification.
  		 
  		}
}
@inproceedings{SandeCVPR14,
  author = {Koen E. A. van de Sande and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Fisher and VLAD with FLAIR},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  month = {June},
  year = {2014},
  pages = {},
  address = {Columbus, Ohio, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-flair-cvpr2014.pdf},
  abstract = {
  
  		A major computational bottleneck in many current algorithms
  		is the evaluation of arbitrary boxes. Dense local
  		analysis and powerful bag-of-word encodings, such
  		as Fisher vectors and VLAD, lead to improved accuracy
  		at the expense of increased computation time. Where a
  		simplification in the representation is tempting, we exploit
  		novel representations while maintaining accuracy. We start
  		from state-of-the-art, fast selective search, but our method
  		will apply to any initial box-partitioning. By representing
  		the picture as sparse integral images, one per codeword,
  		we achieve a Fast Local Area Independent Representation.
  		FLAIR allows for very fast evaluation of any box encoding
  		and still enables spatial pooling. In FLAIR we achieve exact
  		VLAD's difference coding, even with l2 and power-norms.
  		Finally, by multiple codeword assignments, we achieve exact
  		and approximate Fisher vectors with FLAIR. The results
  		are an 18x speedup, which enables us to set a new state-of-the-art
  		on the challenging 2010 PASCAL VOC objects and
  		the fine-grained categorization of the CUB-2011 200 bird
  		species. Plus, we rank number one in the official ImageNet
  		2013 detection challenge.
  		 
  		}
}
@inproceedings{TaoCVPR14,
  author = {Ran Tao and Efstratios Gavves and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Locality in Generic Instance Search from One Example},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  month = {June},
  year = {2014},
  pages = {},
  address = {Columbus, Ohio, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/tao-locality-cvpr2014.pdf},
  abstract = {}
}
@inproceedings{vanHoutICASSP14,
  author = {Julien van Hout and Eric Yeh and Dennis Koelma and Cees G. M. Snoek and Chen Sun and Ramakant Nevatia and Julie Wong and Gregory Myers},
  title = {Late Fusion and Calibration for Multimedia Event Detection Using Few Examples},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  month = {May},
  year = {2014},
  pages = {},
  address = {Florence, Italy},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/hout-fusion-calibration-icassp2014.pdf},
  abstract = {
  
  		The state-of-the-art in example-based multimedia event detection
  		(MED) rests on heterogeneous classifiers whose scores are typically
  		combined in a late-fusion scheme. Recent studies on this topic have
  		failed to reach a clear consensus as to whether machine learning
  		techniques can outperform rule-based fusion schemes with varying
  		amount of training data. In this paper, we present two parametric
  		approaches to late fusion: a normalization scheme for arithmetic
  		mean fusion (logistic averaging) and a fusion scheme based on logistic
  		regression, and compare them to widely used rule-based fusion
  		schemes. We also describe how logistic regression can be used to
  		calibrate the fused detection scores to predict an optimal threshold
  		given a detection prior and costs on errors. We discuss the advantages
  		and shortcomings of each approach when the amount of positives
  		available for training varies from 10 positives (10Ex) to 100
  		positives (100Ex). Experiments were run using video data from the
  		NIST TRECVID MED 2013 evaluation and results were reported in
  		terms of a ranking metric: the mean average precision (mAP) and
  		R0, a cost-based metric introduced in TRECVID MED 2013.
  		 
  		}
}
@inproceedings{HabibianICMR14long,
  author = {Amirhossein Habibian and Thomas Mensink and Cees G. M. Snoek},
  title = {Composite Concept Discovery for Zero-Shot Video Event Detection},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2014},
  pages = {},
  address = {Glasgow, UK},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/habibian-composite-icmr14.pdf},
  abstract = {
  
  		We consider automated detection of events in video without
  		the use of any visual training examples. A common
  		approach is to represent videos as classification scores obtained
  		from a vocabulary of pre-trained concept classifiers.
  		Where others construct the vocabulary by training individual
  		concept classifiers, we propose to train classifiers for
  		combination of concepts composed by Boolean logic operators.
  		We call these concept combinations composite concepts
  		and contribute an algorithm that automatically discovers
  		them from existing video-level concept annotations.
  		We discover composite concepts by jointly optimizing the
  		accuracy of concept classifiers and their effectiveness for detecting
  		events. We demonstrate that by combining concepts
  		into composite concepts, we can train more accurate classifiers
  		for the concept vocabulary, which leads to improved
  		zero-shot event detection. Moreover, we demonstrate that
  		by using different logic operators, namely ``AND'' and ``OR'', we
  		discover different types of composite concepts, which are
  		complementary for zero-shot event detection. We perform
  		a search for 20 events in 41K web videos from two test sets
  		of the challenging TRECVID Multimedia Event Detection
  		2013 corpus. The experiments demonstrate the superior performance
  		of the discovered composite concepts, compared to
  		present-day alternatives, for zero-shot event detection.
  		 
  		}
}
@inproceedings{HabibianICMR14short,
  author = {Amirhossein Habibian and Cees G. M. Snoek},
  title = {Stop-Frame Removal Improves Web Video Classification},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2014},
  pages = {},
  address = {Glasgow, UK},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/habibian-stopframe-icmr14.pdf},
  abstract = {
  
  		Web videos available in sharing sites like YouTube, are becoming
  		an alternative to manually annotated training data,
  		which are necessary for creating video classifiers. However,
  		when looking into web videos, we observe they contain several
  		irrelevant frames that may randomly appear in any
  		video, i.e., blank and overexposed frames. We call these irrelevant
  		frames stop-frames and propose a simple algorithm
  		to identify and exclude them during classifier training. Stop-frames
  		might appear in any video, so it is hard to recognize
  		their category. Therefore we identify stop-frames as those
  		frames, which are commonly misclassified by any concept
  		classifier. Our experiments demonstrate that using our algorithm
  		improves classification accuracy by 60% and 24%
  		in terms of mean average precision for an event and concept
  		detection benchmark.
 
  		}
}
@inproceedings{MazloomICMR14,
  author = {Masoud Mazloom and Xirong Li and Cees G. M. Snoek},
  title = {Few-Example Video Event Retrieval Using Tag Propagation},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2014},
  pages = {},
  address = {Glasgow, UK},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/mazloom-tagpropagation-icmr14.pdf},
  abstract = {
  
  		An emerging topic in multimedia retrieval is to detect a complex
  		event in video using only a handful of video examples.
  		Different from existing work which learns a ranker from positive
  		video examples and hundreds of negative examples, we
  		aim to query web video for events using zero or only a few
  		visual examples. To that end, we propose in this paper a
  		tag-based video retrieval system which propagates tags from
  		a tagged video source to an unlabeled video collection without
  		the need of any training examples. Our algorithm is
  		based on weighted frequency neighbor voting using concept
  		vector similarity. Once tags are propagated to unlabeled
  		video we can rely on off-the-shelf language models to rank
  		these videos by the tag similarity. We study the behavior
  		of our tag-based video event retrieval system by performing
  		three experiments on web videos from the TRECVID multimedia
  		event detection corpus, with zero, one and multiple
  		query examples, beating a recent alternative.
  		 
  		}
}
@inproceedings{SunICMR14,
  author = {Chen Sun and Brian Burns and Ram Nevatia and Cees G. M. Snoek and Bob Bolles and Greg Myers and Wen Wang and Eric Yeh},
  title = {ISOMER: Informative Segment Observations for Multimedia Event Recounting},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2014},
  pages = {},
  address = {Glasgow, UK},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sun-informative-segment-icmr14.pdf},
  abstract = {
  
  		This paper describes a system for multimedia event detection
		and recounting. The goal is to detect a high level event
		class in unconstrained web videos and generate event oriented
		summarization for display to users. For this purpose,
		we detect informative segments and collect observations for
		them, leading to our ISOMER system. We combine a large
		collection of both low level and semantic level visual and
		audio features for event detection. For event recounting,
		we propose a novel approach to identify event oriented discriminative
		video segments and their descriptions with a
		linear SVM event classifier. User friendly concepts including
		objects, actions, scenes, speech and optical character
		recognition are used in generating descriptions. We also develop
		several mapping and filtering strategies to cope with
		noisy concept detectors. Our system performed competitively
		in the TRECVID 2013 Multimedia Event Detection
		task with near 100,000 videos and was the highest performer
		in TRECVID 2013 Multimedia Event Recounting task.
  		 
  		}
}
@inproceedings{GavvesICCV13,
  author = {Efstratios Gavves and Basura Fernando and Cees G. M. Snoek and Arnold W. M. Smeulders and Tinne Tuytelaars},
  title = {Fine-Grained Categorization by Alignments},
  booktitle = {Proceedings of the {IEEE} International Conference on Computer Vision},
  pages = {},
  month = {December},
  year = {2013},
  address = {Sydney, Australia},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gavves-fine-grained-alignment-iccv13.pdf},
  abstract = {
  
  		The aim of this paper is fine-grained categorization without 
  		human interaction. Different from prior work, which relies on 
  		detectors for specific object parts, we propose to localize 
  		distinctive details by roughly aligning the objects using just 
  		the overall shape, since implicit to fine-grained categorization 
  		is the existence of a super-class shape shared among all classes. 
  		The alignments are then used to transfer part annotations from 
  		training images to test images (supervised alignment), or to 
  		blindly yet consistently segment the object in a number of regions 
  		(unsupervised alignment). We furthermore argue that in the distinction 
  		of fine-grained sub-categories, classification-oriented encodings 
  		like Fisher vectors are better suited for describing localized 
  		information than popular matching oriented features like HOG. 
  		We evaluate the method on the CUB-2011 Birds and Stanford Dogs 
  		fine-grained datasets, outperforming the state-of-the-art.
		 
  		}
}
@inproceedings{LiICCV13,
  author = {Zhenyang Li and Efstratios Gavves and Koen E. A. van de Sande and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Codemaps Segment, Classify and Search Objects Locally},
  booktitle = {Proceedings of the {IEEE} International Conference on Computer Vision},
  pages = {},
  month = {December},
  year = {2013},
  address = {Sydney, Australia},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-codemaps-iccv13.pdf},
  abstract = {
  
  		In this paper we aim for segmentation and classification of 
  		objects. We propose codemaps that are a joint formulation of 
  		the classification score and the local neighborhood it belongs 
  		to in the image. We obtain the codemap by reordering the encoding, 
  		pooling and classification steps over lattice elements. Other 
  		than existing linear decompositions which emphasize only the 
  		efficiency benefits for localized search, we make three novel 
  		contributions. As a preliminary, we provide a theoretical 
  		generalization of the sufficient mathematical conditions under 
  		which image encodings and classification becomes locally 
  		decomposable. As first novelty we introduce l2 normalization 
  		for arbitrarily shaped image regions, which is fast enough for 
  		semantic segmentation using our Fisher codemaps. Second, using 
  		the same lattice across images, we propose kernel pooling which 
  		embeds nonlinearities into codemaps for object classification by 
  		explicit or approximate feature mappings. Results demonstrate 
  		that l2 normalized Fisher codemaps improve the state-of-the-art 
  		in semantic segmentation for PASCAL VOC. For object classification 
  		the addition of nonlinearities brings us on par with the 
  		state-of-the-art, but is 3x faster. Because of the codemaps' 
  		inherent efficiency, we can reach significant speed-ups for 
  		localized search as well. We exploit the efficiency gain for our 
  		third novelty: object segment retrieval using a single query 
  		image only.
		 
  		}
}
@inproceedings{LiACM13,
  author = {Xirong Li and Cees G. M. Snoek},
  title = {Classifying Tag Relevance with Relevant Positive and Negative Examples},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {October},
  year = {2013},
  pages = {},
  address = {Barcelona, Spain},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-classifying-tag-relevance-mm2013.pdf},
  abstract = {
 
 		Image tag relevance estimation aims to automatically determine 
  	 	whether what people label about images is factually present in the 
  		pictorial content. Different from previous works, which either 
  		use only positive examples of a given tag or use positive 
  	 	and random negative examples, we argue the importance of 
  	 	relevant positive and relevant negative examples for tag 
  	 	relevance estimation. We propose a system that selects 
  	 	positive and negative examples, deemed most relevant with 
  	 	respect to the given tag from crowd-annotated images. While 
  		applying models for many tags could be cumbersome, our 
  		system trains efficient ensembles of Support Vector Machines 
  	 	per tag, enabling fast classification. Experiments on two 
  	 	benchmark sets show that the proposed system compares 
  	 	favorably against five present day methods. Given extracted 
  	 	visual features, for each image our system can process up 
  		to 3,787 tags per second. The new system is both effective 
  		and efficient for tag relevance estimation.
  		 
  		}
}
@inproceedings{MazloomACM13,
  author = {Masoud Mazloom and Amirhossein Habibian and Cees G. M. Snoek},
  title = {Querying for Video Events by Semantic Signatures from Few Examples},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {October},
  year = {2013},
  pages = {},
  address = {Barcelona, Spain},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/mazloom-query-by-semantic-mm13.pdf},
  abstract = {
  
  		We aim to query web video for complex events using only a 
  		handful of video query examples, where the standard approach 
  		learns a ranker from hundreds of examples. We consider a 
  		semantic signature representation, consisting of off-the-shelf 
  		concept detectors, to capture the variance in semantic 
  		appearance of events. Since it is unknown what similarity 
  		metric and query fusion to use in such an event retrieval 
  		setting, we perform three experiments on unconstrained web 
  		videos from the TRECVID event detection task. It reveals 
  		that: retrieval with semantic signatures using normalized 
  		correlation as similarity metric outperforms a low-level 
  		bag-of-words alternative, multiple queries are best combined 
  		using late fusion with an average operator, and event retrieval 
  		is preferred over event classication when less than eight 
  		positive video examples are available.
  		 
  		}
}
@inproceedings{KordumovaCBMI13,
  author = {Svetlana Kordumova and Xirong Li and Cees G. M. Snoek},
  title = {Evaluating Sources and Strategies for Learning Video Concepts from Social Media},
  booktitle = {International Workshop on Content-Based Multimedia Indexing},
  month = {June},
  year = {2013},
  pages = {},
  address = {Veszpr\'em, Hungary},
  pdf = {},
  abstract = {}
}
@inproceedings{HabibianICMR13,
  author = {Amirhossein Habibian and Koen E. A. van de Sande and Cees G. M. Snoek},
  title = {Recommendations for Video Event Recognition Using Concept Vocabularies},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2013},
  pages = {89--96},
  address = {Dallas, Texas, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/habibian-vocabulary-recommendations-events-icmr2013.pdf},
  abstract = { 
  
  		Representing videos using vocabularies composed of concept detectors appears
  		promising for event recognition. While many have recently shown the benefits of
  		concept vocabularies for recognition, the important question what concepts to
  		include in the vocabulary is ignored. In this paper, we study how to create an
  		effective vocabulary for arbitrary event recognition in web video. We consider
  		four research questions related to the number, the type, the specificity and the
  		quality of the detectors in concept vocabularies. A rigorous experimental protocol
  		using a pool of 1,346 concept detectors trained on publicly available annotations,
  		a dataset containing 13,274 web videos from the Multimedia Event Detection
  		benchmark, 25 event groundtruth definitions, and a state-of-the-art event
  		recognition pipeline allow us to analyze the performance of various concept 
  		vocabulary definitions. From the analysis we arrive at the recommendation that for
  		effective event recognition the concept vocabulary should i) contain more than 200
  		concepts, ii) be diverse by covering object, action, scene, people, animal and
  		attribute concepts, iii) include both general and specific concepts, and iv)
  		increase the number of concepts rather than improve the quality of the individual
  		detectors. We consider the recommendations for video event recognition using
  		concept vocabularies the most important contribution of the paper, as they provide
  		guidelines for future work. }
}
@inproceedings{MazloomICMR13,
  author = {Masoud Mazloom and Efstratios Gavves and Koen E. A. van de Sande and Cees G. M. Snoek},
  title = {Searching Informative Concept Banks for Video Event Detection},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2013},
  pages = {255--262},
  address = {Dallas, Texas, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/mazloom-concept-banks-icmr2013.pdf},
  abstract = { 
  		
  		An emerging trend in video event detection is to learn an event from a bank of 
  		concept detector scores. Different from existing work, which simply relies on a 
  		bank containing all available detectors, we propose in this paper an algorithm 
  		that learns from examples what concepts in a bank are most informative per event. 
  		We model finding this bank of informative concepts out of a large set of concept 
  		detectors as a rare event search. Our proposed approximate solution finds the 
  		optimal concept bank using a cross-entropy optimization. We study the behavior of 
  		video event detection based on a bank of informative concepts by performing three 
  		experiments on more than 1,000 hours of arbitrary internet video from the TRECVID 
  		multimedia event detection task. Starting from a concept bank of 1,346 detectors 
  		we show that 1.) some concept banks are more informative than others for specific 
  		events, 2.) event detection using an automatically obtained informative concept 
  		bank is more robust than using all available concepts, 3.) even for small amounts 
  		of training examples an informative concept bank outperforms a full bank and a 
  		bag-of-word event representation, and 4.) we show qualitatively that the 
  		informative concept banks make sense for the events of interest, without being 
		programmed to do so. We conclude that for concept banks it pays to be informative.
		}
}
@inproceedings{ModoloSPIE13,
  author = {Davide Modolo and Cees G. M. Snoek},
  title = {Can Object Detectors Aid Internet Video Event Retrieval?},
  booktitle = {Proceedings of the IS\&T/SPIE Symposium on Electronic Imaging},
  pages = {},
  month = {February},
  year = {2013},
  address = {San Francisco, CA, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/modolo-object-event-spie2013.pdf},
  abstract = { 
  
  		The problem of event representation for automatic event detection in Internet 
  		videos is acquiring an increasing importance, due to their applicability to a 
  		large number of applications. Existing methods focus on representing events in 
  		terms of either low-level descriptors or domain-specific models suited for a 
  		limited class of video only, ignoring the high-level meaning of the events. 
  		Ultimately aiming for a more robust and meaningful representation, in this paper 
  		we question whether object detectors can aid video event retrieval. We propose an 
  		experimental study that investigates the utility of present-day local and global 
  		object detectors for video event search. By evaluating object detectors optimized 
  		for high-quality photographs on low-quality Internet video, we establish that 
  		present-day detectors can successfully be used for recognizing objects in web 
  		videos. We use an object-based representation to re-rank the results of an 
  		appearance-based event detector. Results on the challenging TRECVID multimedia 
  		event detection corpus demonstrate that objects can indeed aid event retrieval. 
  		While much remains to be studied, we believe that our experimental study is a first 
  		step towards revealing the potential of object-based event representations. 
  		}
}
@inproceedings{GavvesCVPR12,
  author = {Efstratios Gavves and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Convex Reduction of High-Dimensional Kernels for Visual Classification},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages = {},
  month = {June},
  year = {2012},
  address = {Providence, Rhode Island, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gavves-convex-kernel-cvpr2012.pdf},
  abstract = {
  		 
  		 Limiting factors of fast and effective classifiers for large sets 
  		 of images are their dependence on the number of images analyzed 
  		 and the dimensionality of the image representation. Considering 
  		 the growing number of images as a given, we aim to reduce the 
  		 image feature dimensionality in this paper. We propose reduced 
  		 linear kernels that use only a portion of the dimensions to 
  		 reconstruct a linear kernel. We formulate the search for these 
  		 dimensions as a convex optimization problem, which can be solved 
  		 efficiently. Different from existing kernel reduction methods, 
  		 our reduced kernels are faster and maintain the accuracy benefits 
  		 from non-linear embedding methods that mimic non-linear SVMs. We 
  		 show these properties on both the Scenes and PASCAL VOC 2007 
  		 datasets. In addition, we demonstrate how our reduced kernels 
  		 allow us to compress Fisher vectors for use with non-linear 
  		 embeddings, leading to high accuracy. What is more, without using 
  		 any labeled examples the selected and weighted kernel dimensions 
  		 appear to correspond to visually meaningful patches in the images. 
		 
  		}
}
@inproceedings{LiICMR12,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {Fusing Concept Detection and Geo Context for Visual Search},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {June},
  year = {2012},
  pages = {},
  address = {Hong Kong, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-geo-context-icmr2012.pdf},
  note = {Best paper runner-up},
  abstract = {
  		 
  		 Given the proliferation of geo-tagged images, the question of
		 how to exploit geo tags and the underlying geo context for
		 visual search is emerging. Based on the observation that the
		 importance of geo context varies over concepts, we propose
		 a concept-based image search engine which fuses visual concept
		 detection and geo context in a concept-dependent manner.
		 Compared to individual content-based and geo-based
		 concept detectors and their uniform combination, concept-dependent
		 fusion shows improvements. Moreover, since the
		 proposed search engine is trained on social-tagged images
		 alone without the need of human interaction, it is flexible
		 to cope with many concepts. Search experiments on
		 101 popular visual concepts justify the viability of the proposed
		 solution. In particular, for 79 out of the 101 concepts,
		 the learned weights yield improvements over the uniform
		 weights, with a relative gain of at least 5\% in terms of average
		 precision.
  		 
  		}
}
@inproceedings{VreeswijkICMR12,
  author = {Daan T. J. Vreeswijk and Koen E. A. van de Sande and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {All Vehicles are Cars: Subclass Preferences in Container Concepts},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {June},
  year = {2012},
  pages = {},
  address = {Hong Kong, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/vreeswijk-vehicles-are-cars-icmr2012.pdf},
  abstract = {
  		 
  		 This paper investigates the natural bias humans display
		 when labeling images with a container label like vehicle or
		 carnivore. Using three container concepts as subtree root
		 nodes, and all available concepts between these roots and
		 the images from the ImageNet Large Scale Visual Recognition
		 Challenge (ILSVRC) dataset, we analyze the differences
		 between the images labeled at these varying levels of 
		 abstraction and the union of their constituting leaf nodes. 
		 We find  that for many container concepts, a strong preference 
		 for one or a few different constituting leaf nodes occurs. 
		 These results indicate that care is needed when using hierarchical
		 knowledge in image classification: if the aim is to classify
		 vehicles the way humans do, then cars and buses may be
		 the only correct results.
  		 
  		}
}
@inproceedings{FreiburgACM11,
  author = {Bauke Freiburg and Jaap Kamps and Cees G. M. Snoek},
  title = {Crowdsourcing Visual Detectors for Video Search},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {December},
  year = {2011},
  pages = {},
  address = {Scottsdale, AZ, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/freiburg-crowdsourcing-acm2011.pdf},
  abstract = {
  
  		 In this paper, we study social tagging at the video fragment-level
		 using a combination of automated content understanding and the
		 wisdom of the crowds. We are interested in the question whether
		 crowdsourcing can be beneficial to a video search engine that
		 automatically recognizes video fragments on a semantic level. To
		 answer this question, we perform a 3-month online field study with
		 a concert video search engine targeted at a dedicated
		 user-community of pop concert enthusiasts. We harvest the feedback
		 of more than 500 active users and perform two experiments. In
		 experiment 1 we measure user incentive to provide feedback, in
		 experiment 2 we determine the tradeoff between feedback quality
		 and quantity when aggregated over multiple users. Results show
		 that users provide sufficient feedback, which becomes highly
		 reliable when a crowd agreement of 67\% is enforced.
  		 
  		}
}
@inproceedings{LiACM11,
  author = {Xirong Li and Efstratios Gavves and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {Personalizing Automated Image Annotation using Cross-Entropy},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {December},
  year = {2011},
  pages = {},
  address = {Scottsdale, AZ, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-personalized-acm2011.pdf},
  abstract = {
  		 
  		 Annotating the increasing amounts of user-contributed images
		 in a personalized manner is in great demand. However,
		 this demand is largely ignored by the mainstream of automated
		 image annotation research. In this paper we aim
		 for personalizing automated image annotation by jointly exploiting
		 personalized tag statistics and content-based image
		 annotation. We propose a cross-entropy based learning algorithm
		 which personalizes a generic annotation model by
		 learning from a user’s multimedia tagging history. Using
		 cross-entropy-minimization based Monte Carlo sampling, the
		 proposed algorithm optimizes the personalization process in
		 terms of a performance measurement which can be flexibly
		 chosen. Automatic image annotation experiments with
		 5,315 realistic users in the social web show that the proposed
		 method compares favorably to a generic image annotation
		 method and a method using personalized tag statistics only.
		 For 4,442 users the performance improves, where for 1,088
		 users the absolute performance gain is at least 0.05 in terms
		 of average precision. The results show the value of the proposed 
		 method.
  		 
  		}
}
@inproceedings{LiICMR11,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {Social Negative Bootstrapping for Visual Categorization},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Retrieval},
  month = {April},
  year = {2011},
  pages = {},
  address = {Trento, Italy},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-social-negative-icmr2011.pdf},
  abstract = {
  		 
  		 To learn classifiers for many visual categories, obtaining 
  		 labeled training examples in an efficient way is crucial. Since
		 a classifier tends to misclassify negative examples which are
		 visually similar to positive examples, inclusion of such 
		 informative negatives should be stressed in the learning process.
		 However, they are unlikely to be hit by random sampling,
		 the de facto standard in literature. In this paper, we go
		 beyond random sampling by introducing a novel social 
		 negative bootstrapping approach. Given a visual category and
		 a few positive examples, the proposed approach adaptively
		 and iteratively harvests informative negatives from a large
		 amount of social-tagged images. To label negative examples
		 without human interaction, we design an effective virtual
		 labeling procedure based on simple tag reasoning. Virtual
		 labeling, in combination with adaptive sampling, enables us
		 to select the most misclassified negatives as the informative
		 samples. Learning from the positive set and the informative
		 negative sets results in visual classifiers with higher 
		 accuracy. Experiments on two present-day image benchmarks
		 employing 650K virtually labeled negative examples show
		 the viability of the proposed approach. On a popular visual
		 categorization benchmark our precision at 20 increases by
		 34\%, compared to baselines trained on randomly sampled
		 negatives. We achieve more accurate visual categorization
		 without the need of manually labeling any negatives.

  		}
}
@inproceedings{HurstMMM11,
  author = {Wolfgang H\"urst and Cees G. M. Snoek and Willem-Jan Spoel and Mate Tomin},
  title = {Size Matters! How Thumbnail Number, Size, and Motion Influence Mobile Video Retrieval},
  booktitle = {International Conference on MultiMedia Modeling},
  month = {January},
  year = {2011},
  pages = {},
  address = {Taipei, Taiwan},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/huerst-size-matters-mmm2011.pdf},
  demo = {http://vimeo.com/19595895},
  abstract = {
  		 
  		 Various interfaces for video browsing and retrieval have been proposed 
  		 that provide improved usability, better retrieval performance, and richer 
  		 user experience compared to simple result lists that are just sorted by 
  		 relevance. These browsing interfaces take advantage of the rather large 
  		 screen estate on desktop and laptop PCs to visualize advanced 
  		 configurations of thumbnails summarizing the video content. Naturally, 
  		 the usefulness of such screen-intensive visual browsers can be called 
  		 into question when applied on small mobile handheld devices, such as 
  		 smart phones. In this paper, we address the usefulness of thumbnail 
  		 images for mobile video retrieval interfaces. In particular, we 
  		 investigate how thumbnail number, size, and motion influence the 
  		 performance of humans in common recognition tasks. Contrary to the widespread 
  		 belief that screens of handheld devices are unsuited for visualizing 
  		 multiple (small) thumbnails simultaneously, our study shows that users 
  		 are quite able to handle and assess multiple small thumbnails at the 
  		 same time, especially when they show moving images. Our results give 
  		 suggestions for appropriate video retrieval interface designs on 
  		 handheld devices. 
  		 
                }
}
@inproceedings{GavvesACM10,
  author = {Efstratios Gavves and Cees G. M. Snoek},
  title = {Landmark Image Retrieval Using Visual Synonyms},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {October},
  year = {2010},
  pages = {},
  address = {Firenze, Italy},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gavves-synonyms-acm10.pdf},
  abstract = {
  		 
  		 In this paper, we consider the incoherence problem of the visual words 
  		 in bag-of-words vocabularies. Different from existing work, which performs 
  		 assignment of words based solely on closeness in descriptor space, we 
  		 focus on identifying pairs of independent, distant words -- the visual 
  		 synonyms -- that are still likely to host image patches with similar 
  		 appearance. To study this problem, we focus on landmark images, where 
  		 we can examine whether image geometry is an appropriate vehicle for 
  		 detecting visual synonyms. We propose an algorithm for the extraction 
  		 of visual synonyms in landmark images. To show the merit of visual 
  		 synonyms, we perform two experiments. We examine closeness of synonyms 
  		 in descriptor space and we show a first application of visual synonyms 
  		 in a landmark image retrieval setting. Using visual synonyms, we perform 
  		 on par with the state-of-the-art, but with six times fewer visual words.
  		 
  		}
}
@inproceedings{HurstACM10,
  author = {Wolfgang H\"urst and Cees G. M. Snoek and Willem-Jan Spoel and Mate Tomin},
  title = {Keep Moving! Revisiting Thumbnails for Mobile Video Retrieval},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  month = {October},
  year = {2010},
  pages = {},
  address = {Firenze, Italy},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/huerst-keep-moving-acm2010.pdf},
  demo = {http://vimeo.com/19595895},
  abstract = {
  		 
  		 Motivated by the increasing popularity of video on handheld
		 devices and the resulting importance for effective video retrieval,
		 this paper revisits the relevance of thumbnails in a mobile video
		 retrieval setting. Our study indicates that users are quite able to
		 handle and assess small thumbnails on a mobile's screen --
		 especially with moving images -- suggesting promising avenues
		 for future research in design of mobile video retrieval interfaces.
		 
		}
}
@inproceedings{SandeCVGPU10,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {Accelerating Visual Categorization with the {GPU}},
  booktitle = {{ECCV} Workshop on Computer Vision on {GPU}},
  pages = {},
  month = {September},
  year = {2010},
  address = {Crete, Greece},
  pdf = {},
  abstract = {}
}
@inproceedings{HuurninkCIVR10,
  author = {Bouke Huurnink and Cees G. M. Snoek and Maarten {de Rijke} and Arnold W. M. Smeulders},
  title = {Today's and Tomorrow's Retrieval Practice in the Audiovisual Archive},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages = {18--25},
  month = {July},
  year = {2010},
  address = {Xi'an, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/huurnink-archive-civr2010.pdf},
  abstract = {
  		 
  		 Content-based video retrieval is maturing to the point where 
  		 it can be used in real-world retrieval practices. One such 
  		 practice is the audiovisual archive, whose users increasingly 
  		 require fine-grained access to broadcast television content. 
  		 We investigate to what extent content-based video retrieval 
  		 methods can improve search in the audiovisual archive. In 
  		 particular, we propose an evaluation methodology tailored to 
  		 the specific needs and circumstances of the audiovisual archive, 
  		 which are typically missed by existing evaluation initiatives. 
  		 We utilize logged searches and content purchases from an 
  		 existing audiovisual archive to create realistic query sets 
  		 and relevance judgments. To reflect the retrieval practice of 
  		 both the archive and the video retrieval community as closely 
  		 as possible, our experiments with three video search engines 
  		 incorporate archive-created catalog entries as well as 
  		 state-of-the-art multimedia content analysis results. We find 
  		 that incorporating content-based video retrieval into the 
  		 archive’s practice results in significant performance increases 
  		 for shot retrieval and for retrieving entire television programs. 
  		 Our experiments also indicate that individual content-based 
  		 retrieval methods yield approximately equal performance gains. 
  		 We conclude that the time has come for audiovisual archives to 
  		 start accommodating content-based video retrieval methods into 
  		 their daily practice.
  		   		 
  		}
}
@inproceedings{LiACM09,
  author = {Xirong Li and Cees G. M. Snoek},
  title = {Visual Categorization with Negative Examples for Free},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {},
  month = {October},
  year = {2009},
  address = {Beijing, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-negative-for-free-acm2009.pdf},
  abstract = {
  		 
  		 Automatic visual categorization is critically dependent on labeled examples 
  		 for supervised learning. As an alternative to traditional expert labeling, 
  		 social-tagged multimedia is becoming a novel yet subjective and inaccurate 
  		 source of learning examples. Different from existing work focusing on collecting 
  		 positive examples, we study in this paper the potential of substituting social 
  		 tagging for expert labeling for creating negative examples. We present an 
  		 empirical study using 6.5 million Flickr photos as a source of social tagging. 
  		 Our experiments on the PASCAL VOC challenge 2008 show that with a relative loss 
  		 of only 4.3\% in terms of mean average precision, expert-labeled negative 
  		 examples can be completely replaced by social-tagged negative examples for 
  		 consumer photo categorization. 
  		   		 
  		}
}
@inproceedings{SetzICME09,
  author = {Arjan T. Setz and Cees G. M. Snoek},
  title = {Can Social Tagged Images Aid Concept-Based Video Search?},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {1460--1463},
  month = {June--July},
  year = {2009},
  address = {},
  note = {Invited paper.},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/setz-social-tags-icme2009.pdf},
  abstract = {
  
		 This paper seeks to unravel whether commonly available social tagged
		 images can be exploited as a training resource for concept-based
		 video search. Since social tags are known to be ambiguous, overly
		 personalized, and often error prone, we place special emphasis on
		 the role of disambiguation. We present a systematic experimental
		 study that evaluates concept detectors based on social tagged
		 images, and their disambiguated versions, in three application
		 scenarios: within-domain, cross-domain, and together with an
		 interacting user. The results indicate that social tagged images can
		 aid concept-based video search indeed, especially after
		 disambiguation and when used in an interactive video retrieval
		 setting. These results open-up interesting avenues for future
		 research.
		 
		}
}
@inproceedings{LiICASSP09,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
  title = {Annotating Images by Harnessing Worldwide User-Tagged Photos},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  pages = {},
  month = {April},
  year = {2009},
  address = {Taipei, Taiwan},
  note = {Invited paper.},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-annotating-images-icassp2009.pdf},
  abstract = {
  		 
  		 Automatic image tagging is important yet challenging due to the
		 semantic gap and the lack of learning examples to model a tag's
		 visual diversity. Meanwhile, social user tagging is creating rich
		 multimedia content on the web. In this paper, we propose to combine
		 the two tagging approaches in a search-based framework. For an
		 unlabeled image, we first retrieve its visual neighbors from a large
		 user-tagged image database. We then select relevant tags from the
		 result images to annotate the unlabeled image. To tackle the
		 unreliability and sparsity of user tagging, we introduce a
		 joint-modality tag relevance estimation method which efficiently
		 addresses both textual and visual clues. Experiments on 1.5 million
		 Flickr photos and 10 000 Corel images verify the proposed method.

		}
}
@inproceedings{ByrneSAMT08,
  author = {Daragh Byrne and Aiden R. Doherty and Cees G. M. Snoek and Gareth J. F. Jones and Alan F. Smeaton},
  title = {Validating the Detection of Everyday Concepts in Visual Lifelogs},
  booktitle = {Proceedings of the International Conference on Semantic and Digital Media Technologies, SAMT 2008, Koblenz, Germany, December 3-5, 2008},
  publisher = {Springer-Verlag},
  series = {LNCS},
  editors = {},
  pages = {15--30},
  month = {December},
  year = {2008},
  address = {Berlin, Germany},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/byrne-everyday-concepts-samt2008.pdf},
  abstract = {
  		 
  		 The Microsoft SenseCam is a small lightweight wearable camera used 
  		 to passively capture photos and other sensor readings from a user's 
  		 day-to-day activities. It can capture up to 3,000 images per day, 
  		 equating to almost 1 million images per year. It is used to aid memory 
  		 by creating a personal multimedia lifelog, or visual recording of 
  		 the wearer's life. However the sheer volume of image data captured 
  		 within a visual lifelog creates a number of challenges, particularly 
  		 for locating relevant content. Within this work, we explore the 
  		 applicability of semantic concept detection, a method often used 
  		 within video retrieval, on the novel domain of visual lifelogs. A 
  		 concept detector models the correspondence between low-level visual 
  		 features and high-level semantic concepts (such as indoors, outdoors, 
  		 people, buildings, etc.) using supervised machine learning. By doing 
  		 so it determines the probability of a concept's presence. We apply 
  		 detection of 27 everyday semantic concepts on a lifelog collection 
  		 composed of 257,518 SenseCam images from 5 users. The results were 
  		 then evaluated on a subset of 95,907 images, to determine the 
  		 precision for detection of each semantic concept and to draw some 
  		 interesting inferences on the lifestyles of those 5 users. We 
  		 additionally present future applications of concept detection within 
  		 the domain of lifelogging.
  		
  		}
}
@inproceedings{LiMIR08,
  author = {Xirong Li and Cees G. M. Snoek and Marcel Worring},
  title = {Learning Tag Relevance by Neighbor Voting for Social Image Retrieval},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia Information Retrieval},
  pages = {180--187},
  month = {October},
  year = {2008},
  address = {Vancouver, Canada},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/li-tag-relevance-mir2008.pdf},
  abstract = {
  		 
  		 Social image retrieval is important for exploiting the increasing 
  		 amounts of amateur-tagged multimedia such as Flickr images. Since 
  		 amateur tagging is known to be uncontrolled, ambiguous, and 
  		 personalized, a fundamental problem is how to reliably interpret 
  		 the relevance of a tag with respect to the visual content it is 
  		 describing. Intuitively, if different persons label similar images 
  		 using the same tags, these tags are likely to reflect objective 
  		 aspects of the visual content. Starting from this intuition, we 
  		 propose a novel algorithm that scalably and reliably learns tag 
  		 relevance by accumulating votes from visually similar neighbors. 
  		 Further, treated as tag frequency, learned tag relevance is 
  		 seamlessly embedded into current tag-based social image retrieval 
  		 paradigms. Preliminary experiments on one million Flickr images 
  		 demonstrate the potential of the proposed algorithm. Overall 
  		 comparisons for both single-word queries and multiple-word queries 
  		 show substantial improvement over the baseline by learning and using 
  		 tag relevance. Specifically, compared with the baseline using the 
  		 original tags, on average, retrieval using improved tags increases 
  		 mean average precision by 24\%, from 0.54 to 0.67. Moreover, 
  		 simulated experiments indicate that performance can be improved 
  		 further by scaling up the amount of images used in the proposed 
  		 neighbor voting algorithm. 
  		   		 
  		}
}
@inproceedings{RooijCIVR08,
  author = {Ork de Rooij and Cees G. M. Snoek and Marcel Worring},
  title = {Balancing Thread Based Navigation for Targeted Video Search},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages = {485--494},
  month = {July},
  year = {2008},
  address = {Niagara Falls, Canada},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/rooij-thread-based-navigation-civr2008.pdf},
  abstract = {
  		
  		 Various query methods for video search exist. Because of the
		 semantic gap each method has its limitations. We argue that
		 for effective retrieval query methods need to be combined at
		 retrieval time. However, switching query methods often involves
		 a change in query and browsing interface, which puts
		 a heavy burden on the user. In this paper, we propose a
		 novel method for fast and effective search through large video
		 collections by embedding multiple query methods into a single
		 browsing environment. To that end we introduced the
		 notion of query threads, which contain a shot-based ranking
		 of the video collection according to some feature-based
		 similarity measure. On top of these threads we define several
		 thread-based visualizations, ranging from fast targeted
		 search to very broad exploratory search, with the ForkBrowser 
		 as the balance between fast search and video space
		 exploration. We compare the effectiveness and efficiency of
		 the ForkBrowser with the CrossBrowser on the TRECVID
		 2007 interactive search task. Results show that different
		 query methods are needed for different types of search topics,
		 and that the ForkBrowser requires significantly fewer user
		 interactions to achieve the same result as the CrossBrowser.
		 In addition, both browsers rank among the best interactive
		 retrieval systems currently available.  
  		   		 
  		}
}
@inproceedings{SandeCIVR08,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {A Comparison of Color Features for Visual Concept Classification},
  booktitle = {Proceedings of the {ACM} International Conference on Image and Video Retrieval},
  pages = {141--149},
  month = {July},
  year = {2008},
  address = {Niagara Falls, Canada},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-colorfeatures-civr2008.pdf},
  abstract = {
  
  		 Concept classification is important to access visual information on
		 the level of objects and scene types. So far, intensity-based features
		 have been widely used. To increase discriminative power, color
		 features have been proposed only recently. As many features exist,
		 a structured overview is required of color features in the context of
		 concept classification.
		 Therefore, this paper studies 1. the invariance properties and
		 2. the distinctiveness of color features in a structured way. The
		 invariance properties of color features with respect to photometric
		 changes are summarized. The distinctiveness of color features is
		 assessed experimentally using an image and a video benchmark:
		 the PASCAL VOC Challenge 2007 and the Mediamill Challenge.
		 Because color features cannot be studied independently from the
		 points at which they are extracted, different point sampling strategies
		 based on Harris-Laplace salient points, dense sampling and the
		 spatial pyramid are also studied.
		 From the experimental results, it can be derived that invariance
		 to light intensity changes and light color changes affects concept
		 classification. The results reveal further that the usefulness of 
		 invariance is concept-specific.
    		   		 
  		}
}
@inproceedings{SandeCVPR08,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {Evaluation of Color Descriptors for Object and Scene Recognition},
  booktitle = {Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages = {},
  month = {June},
  year = {2008},
  address = {Anchorage, Alaska},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-colordescriptors-cvpr2008.pdf},
  abstract = {
  
  		 Image category recognition is important to access visual
		 information on the level of objects and scene types. So far,
		 intensity-based descriptors have been widely used. To increase
		 illumination invariance and discriminative power,
		 color descriptors have been proposed only recently. As
		 many descriptors exist, a structured overview of color invariant
		 descriptors in the context of image category recognition
		 is required.
		 Therefore, this paper studies the invariance properties
		 and the distinctiveness of color descriptors in a structured
		 way. The invariance properties of color descriptors are
		 shown analytically using a taxonomy based on invariance
		 properties with respect to photometric transformations. The
		 distinctiveness of color descriptors is assessed experimentally
		 using two benchmarks from the image domain and the
		 video domain.
		 From the theoretical and experimental results, it can be
		 derived that invariance to light intensity changes and light
		 color changes affects category recognition. The results reveal
		 further that, for light intensity changes, the usefulness
		 of invariance is category-specific.  
  		   		 
  		}
}
@inproceedings{SandeCGIV08,
  author = {Koen E. A. van de Sande and Theo Gevers and Cees G. M. Snoek},
  title = {Color Descriptors for Object Category Recognition},
  booktitle = {Proceedings of the {IS\&T} European Conference on Colour in Graphics, Imaging, and Vision},
  pages = {},
  month = {June},
  year = {2008},
  address = {Terrassa-Barcelona, Spain},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sande-color-descriptors-cgiv2008.pdf},
  abstract = {
  
  		 Category recognition is important to access visual information
		 on the level of objects. A common approach is to compute
		 image descriptors first and then to apply machine learning to
		 achieve category recognition from annotated examples. As a 
		 consequence, the choice of image descriptors is of great influence 
		 on the recognition accuracy. So far, intensity-based (e.g. SIFT) 
		 descriptors computed at salient points have been used. However, 
		 color has been largely ignored. The question is, can color 
		 information improve accuracy of category recognition?
		 Therefore, in this paper, we will extend both salient point
		 detection and region description with color information. The 
		 extension of color descriptors is integrated into the framework 
		 of category recognition enabling to select both intensity and 
		 color variants. Our experiments on an image benchmark show that
		 category recognition benefits from the use of color. Moreover,
		 the combination of intensity and color descriptors yields a 30\%
		 improvement over intensity features alone.  
  		   		 
  		}
}
@inproceedings{RooijACM07,
  author = {Ork de Rooij and Cees G. M. Snoek and Marcel Worring},
  title = {Query on Demand Video Browsing},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {811--814},
  month = {September},
  year = {2007},
  address = {Augsburg, Germany},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/rooij-rotor-acm2007.pdf},
  abstract = {
  
  		 This paper describes a novel method for browsing a large
		 collection of news video by linking various forms of related
		 video fragments together as threads. Each thread contains
		 a sequence of shots with high feature-based similarity. Two
		 interfaces are designed which use threads as the basis for
		 browsing. One interface shows a minimal set of threads,
		 and the other as many as possible. Both interfaces are
		 evaluated in the TRECVID interactive retrieval task, where
		 they ranked among the best interactive retrieval systems
		 currently available. The results indicate that the use of
		 threads in interactive video search is very beneficial. We
		 have found that in general the query result and the timeline
		 are the most important threads. However, having several
		 additional threads allows a user to find unique results which
		 cannot easily be found by using query results and time alone.
  		   		 
  		}
}
@inproceedings{SmeuldersICIAP07,
  author = {Arnold W. M. Smeulders and Jan C. van Gemert and Bouke Huurnink and Dennis C. Koelma and Ork de Rooij and Koen E. A. van de Sande and Cees G. M. Snoek and Cor J. Veenman and Marcel Worring},
  title = {Semantic Video Search},
  booktitle = {International Conference on Image Analysis and Processing},
  pages = {},
  month = {September},
  year = {2007},
  address = {Modena, Italy},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/smeulders-search-iciap2007.pdf},
  abstract = {
  
                 In this paper we describe the current performance of our MediaMill
		 system as presented in the TRECVID 2006 benchmark for video search
		 engines. The MediaMill team participated in two tasks: concept
		 detection and search. For concept detection we use the MediaMill
		 Challenge as experimental platform. The MediaMill Challenge divides
		 the generic video indexing problem into a visual-only, textual-only,
		 early fusion, late fusion, and combined analysis experiment. We
		 provide a baseline implementation for each experiment together with
		 baseline results. We extract image features, on global, regional,
		 and keypoint level, which we combine with various supervised
		 learners. A late fusion approach of visual-only analysis methods
		 using the geometric mean was our most successful run. With this run we
		 surpass the Challenge baseline by more than 50\%. Our concept
		 detection experiments have resulted in the best score for three
		 concepts, i.e., \emph{desert}, \emph{flag us}, and \emph{charts}.
		 What is more, using LSCOM annotations, our visual-only approach
		 generalizes well to a set of 491 concept detectors. To handle such a
		 large thesaurus in retrieval, an engine is developed which allows
		 users to select relevant concept detectors based on interactive
		 browsing using advanced visualizations. Similar to previous years,
		 our best interactive search runs yield top performance, ranking 2nd
		 and 6th overall.    
  		 
  		}
}
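
The late-fusion rule mentioned in the abstract above, combining visual-only runs with a geometric mean of their scores, can be illustrated with a short Python sketch. This is not the authors' code: the score arrays, their shapes, and the epsilon guard are hypothetical placeholders, shown only to make the fusion rule concrete.

import numpy as np

def geometric_mean_fusion(score_lists):
    """Fuse per-shot confidence scores from several runs.

    score_lists: list of 1-D arrays, one per run, with scores in (0, 1].
    Returns the element-wise geometric mean across runs.
    """
    scores = np.vstack(score_lists)      # shape: (num_runs, num_shots)
    eps = 1e-12                          # guard against log(0)
    return np.exp(np.mean(np.log(scores + eps), axis=0))

# Example with two hypothetical runs over five shots:
run_a = np.array([0.9, 0.2, 0.6, 0.1, 0.8])
run_b = np.array([0.7, 0.3, 0.5, 0.2, 0.9])
fused = geometric_mean_fusion([run_a, run_b])

The geometric mean rewards shots that score consistently well across all runs, which is one common motivation for preferring it over an arithmetic mean when fusing detector outputs.
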
@inproceedings{SnoekICME07b,
  author = {Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders and Bauke Freiburg},
  title = {The Role of Visual Content and Style for Concert Video Indexing},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {252--255},
  month = {July},
  year = {2007},
  address = {Beijing, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-fabchannel-icme2007.pdf},
  abstract = {
  
  		 This paper contributes to the automatic indexing of concert video.
		 In contrast to traditional methods, which rely primarily on audio
		 information for summarization applications, we explore how a
		 visual-only concept detection approach could be employed. We
		 investigate how our recent method for news video indexing -- which
		 takes into account the role of content and style -- generalizes to
		 the concert domain. We analyze concert video on three levels of
		 visual abstraction, namely: content, style, and their fusion.
		 Experiments with 12 concept detectors, on 45 hours of visually
		 challenging concert video, show that the automatically learned best
		 approach is concept-dependent. Moreover, these results suggest that
		 the visual modality provides ample opportunity for more effective
		 indexing and retrieval of concert video when used in addition to the
		 auditory modality.
  		   		 
  		}
}
@inproceedings{SnoekICME07a,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Are Concept Detector Lexicons Effective for Video Search?},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {1966--1969},
  month = {July},
  year = {2007},
  address = {Beijing, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-concept-icme2007.pdf},
  abstract = {
  
  		 Until now, systematic studies on the effectiveness of concept
		 detectors for video search have been carried out using fewer than 20
		 detectors, or in combination with other retrieval techniques. We
		 investigate whether video search using just large concept detector
		 lexicons is a viable alternative for present day approaches. We
		 demonstrate that increasing the number of concept detectors in a
		 lexicon indeed yields improved video retrieval performance. In
		 addition, we show that combining concept detectors at query time has
		 the potential to boost performance further. We obtain the
		 experimental evidence on the automatic video search task of TRECVID
		 2005 using 363 machine learned concept detectors.
  		   		 
  		}
}
@inproceedings{WorringICASSP07,
  author = {Marcel Worring and Cees G. M. Snoek and Ork de Rooij and Giang P. Nguyen and Arnold W. M. Smeulders},
  title = {The {MediaMill} Semantic Video Search Engine},
  booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech, and Signal Processing},
  pages = {},
  month = {April},
  year = {2007},
  address = {Honolulu, Hawaii, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/worring-mediamill-icassp2007.pdf},
  note = {\emph{Invited paper}},
  abstract = {
                 
                 In this paper we present the methods underlying the MediaMill
		 semantic video search engine. The basis for the engine
		 is a semantic indexing process which is currently based on
		 a lexicon of 491 concept detectors. To support the user in
		 navigating the collection, the system defines a visual similarity
		 space, a semantic similarity space, a semantic thread
		 space, and browsers to explore them. We compare the different
		 browsers and their utility within the TRECVID benchmark.
		 In 2005, we obtained a top-3 result for 19 out of 24
		 search topics; in 2006, for 14 out of 24.
    		 		
		}
}
@inproceedings{NguyenMIR06,
  author = {Giang P. Nguyen and Marcel Worring and Arnold W. M. Smeulders},
  title = {Similarity learning via dissimilarity space in {CBIR}},
  booktitle = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  pages = {107--116},
  month = {October},
  year = {2006},
  address = {Santa Barbara, USA},
  pdf = {http://www.science.uva.nl/research/mediamill/pub/nguyen-dissimilarity-mir2006.pdf},
  abstract = {
  
  		 In this paper, we introduce a new approach to learn dissimilarity for 
  		 interactive search in content-based image retrieval. In the literature,
  		 dissimilarity is often learned via the feature space by feature selection,
  		 feature weighting, or a parameterized function of the features. In contrast
  		 to existing techniques, we use relevance feedback to adjust dissimilarity
  		 in a dissimilarity space. To create a dissimilarity space, we use
  		 Pekalska's method [15]. After the user gives feedback, we apply active
  		 learning with one-class SVM on this space. Results on a Corel dataset 
  		 of 10000 images and a TrecVid collection of 43907 keyframes show that 
  		 our proposed approach can improve the retrieval performance over the 
  		 feature space based approach.
  		 		
		}
}
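
The dissimilarity-space idea in the abstract above, re-representing each image by its distances to a set of prototypes and then learning from relevance feedback in that space, can be sketched roughly as follows. This is a minimal sketch, not the paper's implementation: the feature matrix, the random prototype choice, the Euclidean metric, and the one-class SVM settings are all assumptions made for illustration; the paper follows Pekalska's construction and its own active-learning scheme.

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
features = rng.random((1000, 64))                  # hypothetical low-level features
prototypes = features[rng.choice(1000, 20, replace=False)]

# Dissimilarity space: one column per prototype.
dissim = pairwise_distances(features, prototypes, metric='euclidean')

relevant_idx = [3, 10, 42]                         # indices marked relevant by the user
model = OneClassSVM(gamma='scale', nu=0.1).fit(dissim[relevant_idx])

# Rank the whole collection; higher decision values are closer to the relevant set.
ranking = np.argsort(-model.decision_function(dissim))
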
@inproceedings{SnoekACM06,
  author = {Cees G. M. Snoek and Marcel Worring and Jan C. van Gemert and Jan-Mark Geusebroek and Arnold W. M. Smeulders},
  title = {The Challenge Problem for Automated Detection of 101 Semantic Concepts in Multimedia},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {421--430},
  month = {October},
  year = {2006},
  address = {Santa Barbara, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-challenge-acm2006.pdf},
  abstract = {
  
  		 We introduce the challenge problem for generic video indexing to
		 gain insight into intermediate steps that affect the performance of
		 multimedia analysis methods, while at the same time fostering
		 repeatability of experiments. To arrive at a challenge problem, we
		 provide a general scheme for the systematic examination of automated
		 concept detection methods, by decomposing the generic video indexing
		 problem into 2 unimodal analysis experiments, 2 multimodal analysis
		 experiments, and 1 combined analysis experiment. For each
		 experiment, we evaluate generic video indexing performance on 85
		 hours of international broadcast news data, from the TRECVID
		 2005/2006 benchmark, using a lexicon of 101 semantic concepts. By
		 establishing a minimum performance on each experiment, the challenge
		 problem allows for component-based optimization of the generic
		 indexing issue, while simultaneously offering other researchers a
		 reference for comparison during indexing methodology development. To
		 stimulate further investigations in intermediate analysis steps that
		 influence video indexing performance, the challenge offers to the
		 research community a manually annotated concept lexicon,
		 pre-computed low-level multimedia features, trained classifier
		 models, and five experiments together with baseline performance, 
		 which are all available at http://www.mediamill.nl/challenge/.  		 
  		 
  		}
}
@inproceedings{GemertACM06,
  author = {Jan C. van Gemert and Cees G. M. Snoek and Cor Veenman and Arnold W. M. Smeulders},
  title = {The Influence of Cross-Validation on Video Classification Performance},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {695--698},
  month = {October},
  year = {2006},
  address = {Santa Barbara, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gemert-crossvalidation-acm2006.pdf},
  abstract = {
  
  		 Digital video is sequential in nature. When video data is used in a 
  		 semantic concept classification task, the episodes are usually summarized 
  		 with shots. The shots are annotated as containing, or not containing, 
  		 a certain concept resulting in a labeled dataset. These labeled shots 
  		 can subsequently be used by supervised learning methods (classifiers) 
  		 where they are trained to predict the absence or presence of the concept 
  		 in unseen shots and episodes. The performance of such automatic 
  		 classification systems is usually estimated with cross-validation. By 
  		 taking random samples from the dataset for training and testing, 
  		 some shots from an episode end up in the training set and other shots 
  		 from the same episode in the test set. Accordingly, data 
  		 dependence between training and test set is introduced, resulting in 
  		 too optimistic performance estimates. In this paper, we experimentally 
  		 show this bias, and propose how this bias can be prevented using 
  		 "episode-constrained" cross-validation. Moreover, we show that a 15\% 
  		 higher classifier performance can be achieved by using 
  		 episode-constrained cross-validation for classifier parameter tuning.
  		   		 
  		}
}
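
A minimal sketch of the "episode-constrained" cross-validation idea from the entry above, keeping all shots of an episode in the same fold so that no episode contributes to both training and test data. The features, labels, episode ids, and the SVM are hypothetical stand-ins, not the paper's setup; scikit-learn's GroupKFold is used here as one way to realise the constraint.

import numpy as np
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.svm import SVC

rng = np.random.default_rng(0)
shot_features = rng.random((600, 32))        # hypothetical shot-level features
labels = rng.integers(0, 2, 600)             # concept present / absent
episode_ids = np.repeat(np.arange(60), 10)   # 60 episodes of 10 shots each

clf = SVC(kernel='rbf', C=1.0)
# Shots sharing an episode id are never split across training and test folds.
scores = cross_val_score(clf, shot_features, labels,
                         cv=GroupKFold(n_splits=5), groups=episode_ids)
print(scores.mean())
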
@inproceedings{GeusebroekBMVC06,
  author = {Jan-Mark Geusebroek},
  title = {Compact Object Descriptors from Local Colour Invariant Histograms},
  booktitle = {British Machine Vision Conference},
  pages = {},
  month = {September},
  year = {2006},
  address = {Edinburgh, UK},
  pdf = {http://www.science.uva.nl/~mark/pub/2006/GeusebroekBMVC06.pdf},
  abstract = {
        	 
        	 Much emphasis has recently been placed on the detection and recognition
        	 of locally (weak) affine invariant region descriptors for object
        	 recognition. In this paper, we take recognition one step further by
        	 developing features for non-planar objects. We consider the description
        	 of objects with locally smoothly varying surface. For this class of
        	 objects, colour invariant histogram matching has proven to be very
        	 encouraging. However, matching many local colour cubes is
        	 computationally demanding. We propose a compact colour descriptor,
        	 which we call Wiccest, requiring only 12 numbers to locally capture
        	 colour and texture information. The Wiccest features are shown to be
        	 fairly insensitive to photometric effects like shadow, shading, and
        	 illumination colour. Moreover, we demonstrate the features to be
        	 applicable to highly compressed images while retaining discriminative
        	 power.
  
  		}
}
@inproceedings{WorringICPR06,
  author = {Marcel Worring and Cees G. M. Snoek and Ork de Rooij and Giang P. Nguyen and Dennis C. Koelma},
  title = {Lexicon-based Browsers for Searching in News Video Archives},
  booktitle = {Proceedings of the International Conference on Pattern Recognition},
  pages = {1256--1259},
  month = {August},
  year = 2006,
  address = {Hong Kong, China},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/worring-browsers-icpr2006.pdf},
  abstract = {
  
  		 In this paper we present the methods and visualizations used in the
		 MediaMill video search engine. The basis for the engine is a semantic
		 indexing process which derives a lexicon of 101 concepts. To support
		 the user in navigating the collection, the system defines a visual similarity
		 space, a semantic similarity space, a semantic thread space, and
		 browsers to explore them. The search system is evaluated within the
		 TRECVID benchmark. We obtain a top-3 result for 19 out of 24 search
		 topics. In addition, we obtain the highest mean average precision of
		 all search participants.		
		
  		}
}
@inproceedings{SnoekCIVR06,
  author = {Cees G. M. Snoek and Marcel Worring and Dennis C. Koelma and Arnold W. M. Smeulders},
  title = {Learned Lexicon-driven Interactive Video Retrieval},
  booktitle = {Proceedings of the International Conference on Image and Video Retrieval, CIVR 2006, Tempe, Arizona, July 13-15, 2006},
  editor = {H. Sundaram and others},
  series = {LNCS},
  volume = {4071},
  pages = {11--20},
  publisher = {Springer-Verlag},
  address = {Heidelberg, Germany},
  month = {July},
  year = 2006,
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-lexicon-civr2006.pdf},
  abstract = {
  		 
  		 In this paper we combine automatic learning of a large lexicon of
		 semantic concepts with traditional video retrieval methods into a
		 novel approach to narrow the semantic gap. The core of the proposed
		 solution is formed by the automatic detection of an unprecedented
		 lexicon of 101 concepts. From there, we explore the combination of
		 query-by-concept, query-by-example, query-by-keyword, and user
		 interaction into the \emph{MediaMill} semantic video search engine.
		 We evaluate the search engine against the 2005 NIST TRECVID video
		 retrieval benchmark, using an international broadcast news archive
		 of 85 hours. Top ranking results show that the lexicon-driven search
		 engine is highly effective for interactive video retrieval. 		
		
  		}
}
@inproceedings{SnoekICME06,
  author = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra and Arnold W. M. Smeulders},
  title = {The Semantic Pathfinder for Generic News Video Indexing},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {July},
  year = {2006},
  address = {Toronto, Canada},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-pathfinder-icme2006.pdf},
  abstract = {
  		 
		 This paper presents the semantic pathfinder architecture for
		 generic indexing of video archives. The pathfinder automatically
		 extracts semantic concepts from video based on the exploration of
		 different paths through three consecutive analysis steps, closely
		 linked to the video production process, namely: content analysis, 
		 style analysis, and context analysis. The virtue of the semantic
		 pathfinder is its learned ability to find a best path of analysis
		 steps on a per-concept basis. To show the generality of this
		 indexing approach we develop detectors for a lexicon of 32
		 concepts and we evaluate the semantic pathfinder against the 2004
		 NIST TRECVID video retrieval benchmark, using a news archive of 64
		 hours. Top ranking performance indicates the merit of the semantic
		 pathfinder.
  		 
  		}
}
@inproceedings{GemertSLAM06,
  author = {{Jan C. van} Gemert and Jan-Mark Geusebroek and Cor J. Veenman and Cees G. M. Snoek and Arnold W. M. Smeulders},
  title = {Robust Scene Categorization by Learning Image Statistics in Context},
  booktitle = {Int'l Workshop on Semantic Learning Applications in Multimedia, in conjunction with {CVPR'06}},
  pages = {},
  month = {June},
  year = {2006},
  address = {New York, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/gemert-scene-slam2006.pdf},
  abstract = {
  		
  		 We present a generic and robust approach for scene categorization. 
  		 A complex scene is described by proto-concepts like vegetation, 
  		 water, fire, sky, etc. These proto-concepts are represented by 
  		 low-level features, where we use natural image statistics to compactly 
  		 represent color invariant texture information by a Weibull distribution. 
  		 We introduce the notion of contextures which preserve the context of 
  		 textures in a visual scene with an occurrence histogram (context) of
  		 similarities to proto-concept descriptors (texture). In contrast to 
  		 a codebook approach, we use the similarity to all vocabulary elements 
  		 to generalize beyond the code words. Visual descriptors are attained 
  		 by combining different types of contexts with different texture 
  		 parameters. The visual scene descriptors are generalized to visual 
  		 categories by training a support vector machine. We evaluate our 
  		 approach on 3 different datasets: 1) 50 categories for the TRECVID 
  		 video dataset; 2) the Caltech 101-object images; 3) 89 categories 
  		 being the intersection of the Corel photo stock with the Art Explosion 
  		 photo stock. Results show that our approach is robust over different 
  		 datasets, while maintaining competitive performance. 
   
  		}
}
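
The "contexture" idea above, using the similarity to all vocabulary elements instead of hard codeword assignment, can be sketched as a soft-assignment histogram. The region features, prototypes, and Gaussian similarity below are illustrative assumptions only; the paper itself builds on Weibull-based colour-invariant texture features.

import numpy as np
from sklearn.metrics import pairwise_distances

def contexture_descriptor(region_features, prototypes, sigma=1.0):
    """Accumulate similarities to every proto-concept prototype over all regions."""
    d = pairwise_distances(region_features, prototypes)   # (regions, prototypes)
    sims = np.exp(-(d ** 2) / (2 * sigma ** 2))            # similarity to every prototype
    hist = sims.sum(axis=0)                                 # occurrence histogram
    return hist / hist.sum()                                # normalised scene descriptor

rng = np.random.default_rng(0)
regions = rng.random((50, 12))          # 50 hypothetical regions, 12-D features
protos = rng.random((8, 12))            # 8 hypothetical proto-concept prototypes
descriptor = contexture_descriptor(regions, protos)
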
@inproceedings{SmeuldersISCCSP06,
  author = {Arnold W. M. Smeulders and Jan C. van Gemert and Jan-Mark Geusebroek and Cees G. M. Snoek and Marcel Worring},
  title = {Browsing for the National Dutch Video Archive},
  booktitle = {ISCCSP2006},
  pages = {},
  month = {March},
  year = {2006},
  address = {Marrakech, Morocco},
  pdf = {http://www.science.uva.nl/~smeulder/pubs/ISCCSP2006SmeuldersTEMP.pdf},
  abstract = {
 		
 		 Pictures have always been a prime carrier of Dutch culture. But pictures 
 		 take a new form. We live in times of broad- and narrowcasting through 
 		 the Internet, of passive and active viewers, of direct or delayed broadcast, 
 		 and of digital pictures being delivered in the museum or at home. At the 
 		 same time, the picture and television archives turn digital. Archives are 
 		 going to be swamped with information requests unless they swiftly adapt 
 		 to partially automatic annotation and digital retrieval. Our aim is to 
 		 provide faster and more complete access to picture archives by digital 
 		 analysis. Our approach consists of a multi-media analysis of features of 
 		 pictures in tandem with the language that describes those pictures, under 
 		 the guidance of a visual ontology. The general scientific paradigm we 
 		 address is the detection of direct observables fused into semantic 
 		 features learned from large repositories of digital video. We use 
 		 invariant, natural-image-statistics-based contextual feature sets for 
 		 capturing the concepts of images and integrate them as early as possible 
 		 with text. The system consists of a set of visual concepts that is large 
 		 for science yet small for practice, permitting the retrieval of 
 		 semantically formulated queries. We will demonstrate a PC-based, off-line 
 		 trained state-of-the-art system for browsing broadcast news archives.
 		 
  		}
}
@inproceedings{NguyenMIR05,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Scenario optimization for interactive category search},
  booktitle = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  pages = {},
  month = {November},
  year = {2005},
  address = {Singapore},
  pdf = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2005/giangnpMIR05.pdf},
  abstract = {
  		 
  		 Most of the existing work in interactive content based retrieval
		 concentrates on machine learning methods for effective use of
		 relevance feedback. On the other end of the spectrum, the
		 information visualization community focusses on effective methods
		 for conveying information to the user. What is lacking is research
		 that considers information visualization and interactive content-based
		 retrieval as truly integrated parts of one search system. In
		 such an integrated system there are many degrees of freedom like
		 the number of images to display, the image size, different
		 visualization modes, and possible feedback modes. To find optimal
		 values for all of those using user studies is unfeasible. We
		 therefore develop scenarios in which tasks and user actions are
		 simulated. These are then optimized based on objective constraints
		 and evaluation criteria. In such a manner the degrees of freedom
		 are reduced and the remaining degrees can be evaluated in user
		 studies. In this paper we present a system which integrates
		 advanced similarity based visualization with active learning. We
		 have performed extensive scenario based experimentation on an
		 interactive category search task. The results show that indeed the
		 use of advanced visualization and active learning pays off.
		
		}
}
@inproceedings{HollinkACM05,
  author = {Laura Hollink and Marcel Worring and Guus Schreiber},
  title = {Building a Visual Ontology for Video Retrieval},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {479--482},
  month = {November},
  year = {2005},
  address = {Singapore},
  pdf = {http://www.cs.vu.nl/~guus/papers/Hollink05b.pdf},
  abstract = {
  
  		To ensure access to growing video collections, annotation is becoming more
		and more important. Using background knowledge in the form of ontologies
		or thesauri is a way to facilitate annotation in a broad domain. Current
		ontologies are not suitable for (semi-) automatic annotation of visual
		resources as they contain little visual information about the concepts
		they describe. We investigate how an ontology that does contain visual
		information can facilitate annotation in a broad domain and identify
		requirements that a visual ontology has to meet. Based on these
		requirements, we create a visual ontology out of two existing knowledge
		corpora (WordNet and MPEG-7) by creating links between visual and general
		concepts. We test the performance of the ontology on 40 shots of news video,
		and discuss the added value of each visual property.
  
  		}
}
@inproceedings{SnoekACM05a,
  author = {Cees G. M. Snoek and Marcel Worring and Arnold W. M. Smeulders},
  title = {Early versus Late Fusion in Semantic Video Analysis},
  booktitle = {Proceedings of the {ACM} International Conference on Multimedia},
  pages = {399--402},
  month = {November},
  year = {2005},
  address = {Singapore},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-earlylate-acm2005.pdf},
  abstract = {
  		 
  		 Semantic analysis of multimodal video aims to index segments of interest at a 
  		 conceptual level. In reaching this goal, it requires an analysis of several 
  		 information streams. At some point in the analysis these streams need to be 
  		 fused. In this paper, we consider two classes of fusion schemes, namely early 
  		 fusion and late fusion. The former fuses modalities in feature space, the 
  		 latter fuses modalities in semantic space. We show, by experiment on 184 hours 
  		 of broadcast video data and for 20 semantic concepts, that late fusion tends 
  		 to give slightly better performance for most concepts. However, for those 
  		 concepts where early fusion performs better the difference is more significant.
  		 
  		}
}
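
The two fusion schemes compared in the entry above can be sketched side by side: early fusion learns one classifier on concatenated features, late fusion combines per-modality classifier scores. The random features, labels, probabilistic SVMs, and averaging rule below are assumptions made for illustration; the paper's actual classifiers and combination method may differ.

import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
visual = rng.random((400, 40))               # hypothetical visual features
textual = rng.random((400, 20))              # hypothetical textual features
labels = rng.integers(0, 2, 400)

# Early fusion: one classifier in the joint feature space.
early = SVC(probability=True).fit(np.hstack([visual, textual]), labels)
early_scores = early.predict_proba(np.hstack([visual, textual]))[:, 1]

# Late fusion: one classifier per modality, scores combined in semantic space.
vis_clf = SVC(probability=True).fit(visual, labels)
txt_clf = SVC(probability=True).fit(textual, labels)
late_scores = (vis_clf.predict_proba(visual)[:, 1] +
               txt_clf.predict_proba(textual)[:, 1]) / 2
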
@inproceedings{SnoekICME05a,
  author = {Cees G. M. Snoek and Marcel Worring and Jan-Mark Geusebroek and Dennis C. Koelma and Frank J. Seinstra},
  title = {On the Surplus Value of Semantic Video Analysis Beyond the Key Frame},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {July},
  year = {2005},
  address = {Amsterdam, The Netherlands},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-surplus-icme2005.pdf},
  abstract = {
  		 
  		 Typical semantic video analysis methods aim for classification of camera shots 
  		 based on extracted features from a single key frame only. In this paper, we 
  		 sketch a video analysis scenario and evaluate the benefit of analysis beyond 
  		 the key frame for semantic concept detection performance. We developed 
  		 detectors for a lexicon of 26 concepts, and evaluated their performance on 
  		 120 hours of video data. Results show that, on average, detection performance 
  		 can increase with almost 40\% when the analysis method takes more visual 
  		 content into account.
  		 
  		}
}
@inproceedings{SnoekGFKL05,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Multimedia Pattern Recognition in Soccer Video using Time Intervals},
  booktitle = {Classification, the Ubiquitous Challenge, Proceedings of the 28th Annual Conference of the Gesellschaft f{\"u}r Klassifikation e.V., University of Dortmund, March 9-11, 2004},
  publisher = {Springer-Verlag},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  editor = {C. Weihs and W. Gaul},
  pages = {97--108},
  year = {2005},
  address = {Berlin, Germany},
  pdf = {},
  note = {\emph{Invited paper}},
  abstract = {
  		 
  		 We focus on the problem of learning rich semantic patterns from the multimedia 
  		 data associated with broadcast video documents. In this talk we propose a generic 
  		 and flexible framework for produced video classification that is capable of learning 
  		 semantic concepts from multimodal sources based on analyzed style elements. Four 
  		 properties that are indicative for style are identified, i.e. layout, content, 
  		 capture, and concept context. The framework allows for robust classification of 
  		 different semantic concepts in produced video by using a fixed core of common 
  		 layout, content, and capture elements in combination with varying concept specific 
  		 context elements. Concepts are classified using a Stacked Probabilistic Support 
  		 Vector Machine. Results on 120 hours of video data from the 2003 TRECVID benchmark 
  		 show that, by using the proposed framework, several rich semantic concepts in 
  		 broadcast news can be classified with state-of-the-art accuracy.
  		
  		}
}
@inproceedings{NguyenDELOS05,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Similarity based visualization of image collections},
  booktitle = {Proceedings of the 7th International Workshop of the EU Network of Excellence DELOS on Audio-visual Content and Information Visualization in Digital Libraries},
  pages = {},
  month = {May},
  year = {2005},
  address = {Cortona, Italy},
  pdf = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2005/giangnpAVIVDiLib05.pdf},
  abstract = {
  
          	 In the literature, few content-based multimedia retrieval systems use
  		 visualization as a tool for exploring their collections.
  		 However, when searching for images without examples to start with,
  		 one needs to explore the data set. Up to now, most available
  		 systems just show random collections of images in 2D grid form.
  		 More recently, advanced techniques have been developed for
  		 browsing based on similarity. However, none of them analyze the
  		 problems that occur when visualizing large visual collections. In
  		 this paper, we make these problems explicit. From there, we
  		 establish three general requirements: overview, visibility, and
  		 data structure preservation. Solutions for each requirement are
  		 proposed. Finally, a system is presented and experimental results
		 are given to demonstrate our theory and approach.
		
		}
}
@inproceedings{SeinstraIPDPS05,
  author = {Frank J. Seinstra and Cees G. M. Snoek and Dennis C. Koelma and Jan-Mark Geusebroek and Marcel Worring},
  title = {User Transparent Parallel Processing of the 2004 {NIST} {TRECVID} Data Set},
  booktitle = {Proceedings of the 19th International Parallel \& Distributed Processing Symposium},
  pages = {},
  month = {April},
  year = {2005},
  address = {Denver, USA},
  pdf = {http://staff.science.uva.nl/~fjseins/Papers/Conferences/ipdps2005.pdf},
  abstract = {
  
  		 The Parallel-Horus framework, developed at the University of Amsterdam, is a 
  		 unique software architecture that allows non-expert parallel programmers to 
  		 develop fully sequential multimedia applications for efficient execution on 
  		 homogeneous Beowulf-type commodity clusters. Previously obtained results for 
  		 realistic, but relatively small-sized applications have shown the feasibility 
  		 of the Parallel-Horus approach, with parallel performance consistently being 
  		 found to be optimal with respect to the abstraction level of message passing 
  		 programs. In this paper we discuss the most serious challenge Parallel-Horus 
  		 has had to deal with so far: the processing of over 184 hours of video included 
  		 in the 2004 NIST TRECVID evaluation, i.e. the de facto international standard 
  		 benchmark for content-based video retrieval.  Our results and experiences 
  		 confirm that Parallel-Horus is a very powerful support tool for state-of-the-art 
  		 research and applications in multimedia processing.
  		 
		}
}
@inproceedings{HollinkKMSA04,
  author = {Laura Hollink and Giang Nguyen and Guus Schreiber and Jan Wielemaker and Bob Wielinga and Marcel Worring},
  title = {Adding Spatial Semantics to Image Annotations},
  booktitle = {International Workshop on Knowledge Markup and Semantic Annotation},
  address = {Hiroshima, Japan},
  month = {November},
  year = {2004},
  pdf = {http://www.cs.vu.nl/~guus/papers/Hollink04c.pdf},
  abstract = {
  		
  		In this paper we discuss the support of users in adding spatial
		information semi-automatically to annotations of images. Descriptions of
		objects depicted in an image are extended with information about the
		position of those objects. We distinguish two types of spatial concepts:
		absolute positions of objects (e.g., east, west) and relative spatial
		relations between objects (e.g., left, above). We show the use of a tool
		for a collection of art paintings with preexisting RDF annotations,
		including a list of image objects. First, the tool segments a painting
		into regions. The user selects regions, and labels these with objects from
		the existing annotation. Then, the tool computes absolute positions and
		relative spatial relations of the selected regions, and adds these to the
		annotation. A small evaluation study is reported in which annotations
		generated by the tool are compared to manual annotations by ten 
		volunteers.
  
  		}
}
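
The two kinds of spatial concepts described in the entry above, absolute positions and relative spatial relations, can be derived from region bounding boxes roughly as follows. The box coordinates and the centre-based rules are hypothetical simplifications, not the annotation tool's actual computation.

def centroid(box):
    x0, y0, x1, y1 = box
    return ((x0 + x1) / 2.0, (y0 + y1) / 2.0)

def absolute_position(box, image_w, image_h):
    # Absolute position of a region relative to the image centre.
    cx, cy = centroid(box)
    horiz = 'west' if cx < image_w / 2.0 else 'east'
    vert = 'north' if cy < image_h / 2.0 else 'south'
    return vert + '-' + horiz

def relative_relations(box_a, box_b):
    # Relative spatial relations between two regions, from their centroids.
    (ax, ay), (bx, by) = centroid(box_a), centroid(box_b)
    rel = ['left-of' if ax < bx else 'right-of',
           'above' if ay < by else 'below']
    return rel

print(absolute_position((10, 10, 60, 40), 200, 100))              # 'north-west'
print(relative_relations((10, 10, 60, 40), (120, 50, 180, 90)))   # ['left-of', 'above']
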
@inproceedings{HollinkCIVR04,
  author = {Laura Hollink and Giang P. Nguyen and Dennis C. Koelma and Guus Schreiber and Marcel Worring},
  title = {User Strategies in Video Retrieval: a Case Study},
  booktitle = {Proceedings of the International Conference on Image and Video Retrieval, CIVR 2004, Dublin, Ireland, July 21-23, 2004},
  editor = {P. Enser and Y. Kompatsiaris and N.E. O'Connor and A.F. Smeaton and A.W. M. Smeulders},
  series = {LNCS},
  volume = 3115,
  pages = {6--14},
  publisher = {Springer-Verlag},
  address = {Heidelberg, Germany},
  year = 2004,
  pdf = {http://www.cs.vu.nl/~guus/papers/Hollink04b.pdf},
  abstract = {
  		
  		In this paper we present the results of a user study that was conducted in
		combination with a submission to TRECVID 2003. Search behavior of students
		querying an interactive video-retrieval system was analyzed. In total, 242
		searches by 39 students on 24 topics were assessed. Questionnaire data, logged
		user actions on the system, and a quality measure of each search provided by
		TRECVID were studied. Analysis of the results at various stages in the
		retrieval process suggests that retrieval based on transcriptions of the
		speech in video data adds more to the average precision of the result than
		content-based retrieval. The latter is particularly useful in providing
		the user with an overview of the dataset and thus an indication of the
		success of a search.
		
  		}
}
@inproceedings{NguyenICME04a,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {A user based framework for salient detail extraction},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {June},
  year = {2004},
  address = {Taipei, Taiwan},
  pdf = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2004/giangnpICME04a.pdf},
  abstract = {
  
          	 In this paper, we consider the interaction with salient details in
  		 the image i.e. points, lines, and regions. Interactive salient
  		 detail definition goes further than summarizing the image into a
  		 set of salient details since the saliency of details depends on
  		 the context, the application and the user. We propose an
  		 interaction framework for salient details from the perspective of
  		 the user, which dynamically updates the user- and context-dependent
  		 definition of saliency based on relevance feedback. A number of
  		 instantiations of the framework are presented.
  
  		}
}
@inproceedings{NguyenICME04b,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Optimizing similarity based visualization in content based image retrieval},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {June},
  year = {2004},
  address = {Taipei, Taiwan},
  pdf = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2004/giangnpICME04b.pdf},
  abstract = {
  
          	 In any CBIR system, visualization is important, either to show the
  		 final result to the user or to form the basis for interaction.
  		 Advanced systems use 2-dimensional similarity-based visualization,
  		 which shows not only the information of each image itself but also
  		 the relations between images. A problem in interactive 2D
  		 visualization is the overlap between the images displayed. This
  		 obviously reduces the search capability. Simply spreading the
  		 images on the screen space will not preserve the relations between
  		 them. In this paper, we propose a visualization scheme which
  		 reduces the overlap as well as preserves the general distribution
  		 of the images displayed. Results show that an effective balance
		 between display of structures and limited overlap can be achieved.
  
  		}
}
@inproceedings{SnoekICME04,
  author = {Cees G. M. Snoek and Marcel Worring and Alexander G. Hauptmann},
  title = {Detection of {TV} News Monologues by Style Analysis},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {June},
  year = {2004},
  address = {Taipei, Taiwan},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/snoek-style-icme2004.pdf},
  abstract = {
  		
  		 We propose a method for detection of semantic concepts in produced video 
  		 based on style analysis. Recognition of concepts is done by applying a 
  		 classifier ensemble to the detected style elements. As a case study we 
  		 present a method for detecting the concept of news subject monologues. Our 
  		 approach had the best average precision performance amongst 26 submissions 
  		 in the 2003 TRECVID benchmark.
		
		}
}
@inproceedings{WorringICME04,
  author = {Marcel Worring and Giang P. Nguyen and Laura Hollink and Jan C. van Gemert and Dennis C. Koelma},
  title = {Accessing Video Archives Using Interactive Search},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {},
  month = {June},
  year = {2004},
  address = {Taipei, Taiwan},
  pdf = {http://www.cs.vu.nl/~laurah/1/papers/Worring04_trec.pdf},
  abstract = {
  		
  		In this presentation we describe a system for interactive search
		in video archives. In our view, interactive search is a four-step
		process composed of indexing, filtering, browsing, and
		ranking. We have experimentally verified, using 22 groups
		of two participants each, how users apply these steps in the 
		interactive search and how well they perform.
  
  		}
}
@inproceedings{NguyenMIR03,
  author = {Giang P. Nguyen and Marcel Worring},
  title = {Query Definition using Interactive Saliency},
  booktitle = {Proceedings of the {ACM} {SIGMM} International Workshop on Multimedia Information Retrieval},
  pages = {},
  month = {November},
  year = {2003},
  address = {Berkeley, USA},
  pdf = {http://www.science.uva.nl/~giangnp/PUBS/PDF/2003/giangnpMIR03.pdf},
  abstract = {
  
          	 Content-based image retrieval (CBIR) has been under investigation
  		 for a long time with many systems built to meet different
  		 application demands. However, in all systems, there is still a big
  		 gap between the user's expectation and the system's retrieval
  		 capabilities. Therefore, user interaction is an essential
  		 component of any CBIR system. Interaction up to now has mostly
  		 focused on global image features or similarities. We consider the
  		 interaction with salient details in the image i.e. points, lines,
  		 and regions. Interactive salient detail definition goes further
  		 than automatically summarizing the image into a set of salient
  		 details. We aim to dynamically update the user- and
  		 context-dependent definition of saliency based on relevance
  		 feedback from the user. In this paper, we propose an interaction
		 framework for salient details from the perspective of the user.
  
  		}
}
@inproceedings{SnoekICME03a,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {Time Interval Maximum Entropy based Event Indexing in Soccer Video},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  pages = {481--484},
  month = {July},
  year = {2003},
  address = {Baltimore, USA},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/icme2003.pdf},
  abstract = {
  		
  		 Multimodal indexing of events in video documents poses problems with respect 
  		 to representation, inclusion of contextual information, and synchronization 
  		 of the heterogeneous information sources involved. In this paper we present 
  		 the Time Interval Maximum Entropy (TIME) framework that tackles aforementioned 
  		 problems. To demonstrate the viability of TIME for event classification in 
  		 multimodal video, an evaluation was performed on the domain of soccer broadcasts. 
  		 It was found that by applying TIME, the amount of video a user has to watch in 
  		 order to see almost all highlights can be reduced considerably.
  		 
		}
}
@inproceedings{WorringSOFSEM02,
  author = {Marcel Worring and Andrew Bagdanov and Jan van Gemert and Jan-Mark Geusebroek and Minh Hoang and Guus Schreiber and Cees G. M. Snoek and Jeroen Vendrig and Jan Wielemaker and Arnold W. M. Smeulders},
  title = {Interactive Indexing and Retrieval of Multimedia Content},
  booktitle = {Proceedings of the 29th Annual Conference on Current Trends in Theory and Practice of Informatics},
  series = {Lecture Notes in Computer Science},
  volume = {2540},
  pages = {135-148},
  publisher = {Springer-Verlag},
  year = {2002},
  address = {Milovy, Czech Republic},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/sofsem2002.pdf},
  abstract = {
  		 
  		 The indexing and retrieval of multimedia items is difficult due to the semantic 
  		 gap between the user's perception of the data and the descriptions we can derive 
  		 automatically from the data using computer vision, speech recognition, and 
  		 natural language processing. In this contribution we consider the nature of 
  		 the semantic gap in more detail and show examples of methods that help in 
  		 limiting the gap. These methods can be automatic, but in general the indexing 
  		 and retrieval of multimedia items should be a collaborative process between the 
  		 system and the user. We show how to employ the user's interaction for limiting 
  		 the semantic gap.
		
		}
}
@inproceedings{SnoekICME02,
  author = {Cees G. M. Snoek and Marcel Worring},
  title = {A Review on Multimodal Video Indexing},
  booktitle = {Proceedings of the {IEEE} International Conference on Multimedia \& Expo},
  volume = {2},
  pages = {21--24},
  month = {August},
  year = {2002},
  address = {Lausanne, Switzerland},
  pdf = {http://isis-data.science.uva.nl/cgmsnoek/pub/icme2002.pdf},
  abstract = {
  		 
  		 Efficient and effective handling of video documents depends on the availability 
  		 of indexes. Manual indexing is unfeasible for large video collections. Efficient, 
  		 single-modality-based video indexing methods have appeared in the literature. 
  		 Effective indexing, however, requires a multimodal approach in which either the 
  		 most appropriate modality is selected or the different modalities are used in 
  		 collaborative fashion. In this paper we present a framework for multimodal video 
  		 indexing, which views a video document from the perspective of its author. The 
  		 framework serves as a blueprint for a generic and flexible multimodal video 
  		 indexing system, and generalizes different state-of-the-art video indexing 
  		 methods. It furthermore forms the basis for categorizing these different 
  		 methods.
		
		}
}